First pass refactoring scrapers into separate modules (#98)

Splits off the respective scrapers into submodules (mtgs_scraper.py, scryfall_scraper.py, mythic_scraper.py, wizards_scraper.py)
This commit is contained in:
Lee Matos 2017-07-05 21:44:45 -04:00 committed by tritoch
parent 2af17727a4
commit 1dd538d5a1
6 changed files with 1002 additions and 855 deletions

119
main.py
View File

@@ -1,5 +1,9 @@
# -*- coding: utf-8 -*-
import spoilers
import mtgs_scraper
import scryfall_scraper
import mythic_scraper
import wizards_scraper
import os
import commentjson
import json
@@ -7,17 +11,20 @@ import io
import sys
presets = {
"isfullspoil": False, # when full spoil comes around, we only want to use WOTC images
"includeMasterpieces": True, # if the set has masterpieces, let's get those too
"oldRSS": False, # maybe MTGS hasn't updated their spoiler.rss but new cards have leaked
"isfullspoil": False, # when full spoil comes around, we only want to use WOTC images
"includeMasterpieces": True, # if the set has masterpieces, let's get those too
"oldRSS": False, # maybe MTGS hasn't updated their spoiler.rss but new cards have leaked
"split_cards": {
},
"siteorder": ['scryfall','mtgs','mythicspoiler'], # if we want to use one site before another for card data TODO
"imageorder": ['wotc','scryfall','mtgs','mythicspoiler'], # prioritize images from certain sources
"useexclusively": '', # if we *only* want to use one site TODO
"dumpXML": False, # let travis print XML for testing
"scryfallComparison": False, #if we want to debug compare scryfall to other sources, enable
"dumpErrors": True # print the error log from out/errors.json
# if we want to use one site before another for card data TODO
"siteorder": ['scryfall', 'mtgs', 'mythicspoiler'],
# prioritize images from certain sources
"imageorder": ['wotc', 'scryfall', 'mtgs', 'mythicspoiler'],
"useexclusively": '', # if we *only* want to use one site TODO
"dumpXML": False, # let travis print XML for testing
# if we want to debug compare scryfall to other sources, enable
"scryfallComparison": False,
"dumpErrors": True # print the error log from out/errors.json
}
@@ -30,94 +37,114 @@ def load_json(json_file, lib_to_use):
output_file = json.load(data_file)
return output_file
except Exception as ex:
print "Unable to load file: " +json_file+ "\nException information:\n" + str(ex.args)
sys.exit("Unable to load file: "+json_file)
print "Unable to load file: " + json_file + "\nException information:\n" + str(ex.args)
sys.exit("Unable to load file: " + json_file)
setinfos = load_json('set_info','commentjson')
manual_sets = load_json('cards_manual','json')
card_corrections = load_json('cards_corrections','commentjson')
delete_cards = load_json('cards_delete','commentjson')
setinfos = load_json('set_info', 'commentjson')
manual_sets = load_json('cards_manual', 'json')
card_corrections = load_json('cards_corrections', 'commentjson')
delete_cards = load_json('cards_delete', 'commentjson')
errorlog = []
#TODO insert configparser to add config.ini file
# TODO insert configparser to add config.ini file
def parseargs():
for argument in sys.argv:
for preset in presets:
if argument.split('=')[0].lower().replace('-','') == preset.lower():
if argument.split('=')[0].lower().replace('-', '') == preset.lower():
argvalue = argument.split('=')[1]
if argvalue in ['true','True','T','t']:
if argvalue in ['true', 'True', 'T', 't']:
argvalue = True
elif argvalue in ['false','False','F','f']:
elif argvalue in ['false', 'False', 'F', 'f']:
argvalue = False
presets[preset] = argvalue
print "Setting preset " + preset + " to value " + str(argvalue)
def save_allsets(AllSets):
with io.open('out/AllSets.json', 'w', encoding='utf8') as json_file:
data = json.dumps(AllSets, ensure_ascii=False, encoding='utf8', indent=2, sort_keys=True, separators=(',',':'))
data = json.dumps(AllSets, ensure_ascii=False, encoding='utf8',
indent=2, sort_keys=True, separators=(',', ':'))
json_file.write(unicode(data))
def save_masterpieces(masterpieces, setinfo):
with open('out/' + setinfo['masterpieces']['setname'] + '.json', 'w') as outfile:
json.dump(masterpieces, outfile, sort_keys=True, indent=2, separators=(',', ': '))
json.dump(masterpieces, outfile, sort_keys=True,
indent=2, separators=(',', ': '))
def save_setjson(mtgs, filename):
with io.open('out/' + filename + '.json', 'w', encoding='utf8') as json_file:
data = json.dumps(mtgs, ensure_ascii=False, encoding='utf8', indent=2, sort_keys=True, separators=(',',':'))
data = json.dumps(mtgs, ensure_ascii=False, encoding='utf8',
indent=2, sort_keys=True, separators=(',', ':'))
json_file.write(unicode(data))
def save_errorlog(errorlog):
with open('out/errors.json', 'w') as outfile:
json.dump(errorlog, outfile, sort_keys=True, indent=2, separators=(',', ': '))
json.dump(errorlog, outfile, sort_keys=True,
indent=2, separators=(',', ': '))
def save_xml(xmlstring, outfile):
if os.path.exists(outfile):
append_or_write = 'w'
else:
append_or_write = 'w'
with open(outfile,append_or_write) as xmlfile:
with open(outfile, append_or_write) as xmlfile:
xmlfile.write(xmlstring.encode('utf-8'))
if __name__ == '__main__':
parseargs()
AllSets = spoilers.get_allsets() #get AllSets from mtgjson
AllSets = spoilers.get_allsets() # get AllSets from mtgjson
combinedjson = {}
for setinfo in setinfos:
if setinfo['setname'] in AllSets:
print "Found set from set_info " +setinfo['setname']+ " in MTGJSON, not adding it"
print "Found set from set_info " + setinfo['setname'] + " in MTGJSON, not adding it"
continue
if presets['oldRSS'] or 'noRSS' in setinfo and setinfo['noRSS']:
mtgs = { "cards":[] }
mtgs = {"cards": []}
else:
mtgs = spoilers.scrape_mtgs('http://www.mtgsalvation.com/spoilers.rss') #scrape mtgs rss feed
[mtgs, split_cards] = spoilers.parse_mtgs(mtgs, [], [], [], presets['split_cards']) #parse spoilers into mtgjson format
mtgs = spoilers.correct_cards(mtgs, manual_sets[setinfo['setname']]['cards'], card_corrections, delete_cards) #fix using the fixfiles
mtgjson = spoilers.get_image_urls(mtgs, presets['isfullspoil'], setinfo['setname'], setinfo['setlongname'], setinfo['setsize'], setinfo) #get images
mtgs = mtgs_scraper.scrape_mtgs(
'http://www.mtgsalvation.com/spoilers.rss') # scrape mtgs rss feed
[mtgs, split_cards] = mtgs_scraper.parse_mtgs(
mtgs, [], [], [], presets['split_cards']) # parse spoilers into mtgjson format
mtgs = spoilers.correct_cards(
mtgs, manual_sets[setinfo['setname']]['cards'], card_corrections, delete_cards) # fix using the fixfiles
mtgjson = spoilers.get_image_urls(
mtgs, presets['isfullspoil'], setinfo['setname'], setinfo['setlongname'], setinfo['setsize'], setinfo) # get images
if presets['scryfallComparison']:
scryfall = spoilers.get_scryfall(
scryfall = scryfall_scraper.get_scryfall(
'https://api.scryfall.com/cards/search?q=++e:' + setinfo['setname'].lower())
mtgjson = spoilers.smash_mtgs_scryfall(mtgs, scryfall)
mtgjson = scryfall_scraper.smash_mtgs_scryfall(mtgs, scryfall)
if 'fullSpoil' in setinfo and setinfo['fullSpoil']:
wotc = spoilers.scrape_fullspoil('', setinfo)
spoilers.smash_fullspoil(mtgjson, wotc)
[mtgjson, errors] = spoilers.error_check(mtgjson, card_corrections) #check for errors where possible
wotc = wizards_scraper.scrape_fullspoil('', setinfo)
wizards_scraper.smash_fullspoil(mtgjson, wotc)
[mtgjson, errors] = spoilers.error_check(
mtgjson, card_corrections) # check for errors where possible
errorlog += errors
spoilers.write_xml(mtgjson, setinfo['setname'], setinfo['setlongname'], setinfo['setreleasedate'])
spoilers.write_xml(
mtgjson, setinfo['setname'], setinfo['setlongname'], setinfo['setreleasedate'])
#save_xml(spoilers.pretty_xml(setinfo['setname']), 'out/spoiler.xml')
mtgjson = spoilers.add_headers(mtgjson, setinfo)
AllSets = spoilers.make_allsets(AllSets, mtgjson, setinfo['setname'])
if 'masterpieces' in setinfo: #repeat all of the above for masterpieces
#masterpieces aren't in the rss feed, so for the new cards, we'll go to their individual pages on mtgs
#old cards will get their infos copied from mtgjson (including fields that may not apply like 'artist')
#the images will still come from mtgs
masterpieces = spoilers.make_masterpieces(setinfo['masterpieces'], AllSets, mtgjson)
if 'masterpieces' in setinfo: # repeat all of the above for masterpieces
# masterpieces aren't in the rss feed, so for the new cards, we'll go to their individual pages on mtgs
# old cards will get their infos copied from mtgjson (including fields that may not apply like 'artist')
# the images will still come from mtgs
masterpieces = spoilers.make_masterpieces(
setinfo['masterpieces'], AllSets, mtgjson)
[masterpieces, errors] = spoilers.error_check(masterpieces)
errorlog += errors
spoilers.write_xml(masterpieces, setinfo['masterpieces']['setname'], setinfo['masterpieces']['setlongname'], setinfo['masterpieces']['setreleasedate'])
AllSets = spoilers.make_allsets(AllSets, masterpieces, setinfo['masterpieces']['setname'])
spoilers.write_xml(masterpieces, setinfo['masterpieces']['setname'],
setinfo['masterpieces']['setlongname'], setinfo['masterpieces']['setreleasedate'])
AllSets = spoilers.make_allsets(
AllSets, masterpieces, setinfo['masterpieces']['setname'])
save_masterpieces(masterpieces, setinfo)
combinedjson[setinfo['masterpieces']['setname']] = masterpieces
save_setjson(mtgjson, setinfo['setname'])
@@ -128,7 +155,7 @@ if __name__ == '__main__':
errorlog = spoilers.remove_corrected_errors(errorlog, card_corrections)
save_errorlog(errorlog)
save_allsets(AllSets)
#save_setjson(mtgjson)
# save_setjson(mtgjson)
if presets['dumpXML']:
print '<!----- DUMPING SPOILER.XML -----!>'
with open('out/spoiler.xml', 'r') as xmlfile:
@@ -137,7 +164,7 @@ if __name__ == '__main__':
if presets['dumpErrors']:
if errorlog != {}:
print '//----- DUMPING ERROR LOG -----'
print json.dumps(errorlog, ensure_ascii=False, encoding='utf8', indent=2, sort_keys=True, separators=(',',':'))
print json.dumps(errorlog, ensure_ascii=False, encoding='utf8', indent=2, sort_keys=True, separators=(',', ':'))
print '//----- END ERROR LOG -----'
else:
print "No Detected Errors!"

269
mtgs_scraper.py Normal file
View File

@@ -0,0 +1,269 @@
# -*- coding: utf-8 -*-
import requests
import feedparser
import re
import sys
import time
from lxml import html
def scrape_mtgs(url):
    """Fetch the raw MTG Salvation spoiler RSS feed.

    Sends aggressive anti-caching headers so we always get the freshest
    spoiler data rather than a stale cached copy.

    :param url: full URL of the spoiler RSS feed
    :return: the response body as text (the raw RSS/XML document)
    """
    no_cache_headers = {
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Expires': 'Thu, 01 Jan 1970 00:00:00 GMT',
    }
    response = requests.get(url, headers=no_cache_headers)
    return response.text
def parse_mtgs(mtgs, manual_cards=[], card_corrections=[], delete_cards=[], split_cards={}, related_cards=[]):
    """Parse the MTG Salvation spoiler RSS text into an mtgjson-style dict.

    :param mtgs: raw RSS/XML text as returned by scrape_mtgs()
    :param manual_cards: unused here (accepted for interface compatibility)
    :param card_corrections: unused here (accepted for interface compatibility)
    :param delete_cards: card names to drop from the result
    :param split_cards: mapping of front-half name -> back-half name; this
        function also ADDS entries to it as it discovers split/aftermath cards
        (mutable default — shared across calls, so callers should pass it in)
    :param related_cards: mapping of front-face -> back-face names for
        double-faced cards
    :return: [{"cards": [...]}, split_cards]
    """
    # The feed declares utf-16 but is actually utf-8; patch the XML prolog.
    mtgs = mtgs.replace('utf-16', 'utf-8')
    # One regex per card field; each has exactly one named group.
    patterns = ['<b>Name:</b> <b>(?P<name>.*?)<',
                'Cost: (?P<cost>[X]*\d{0,2}[XWUBRGC]*?)<',
                'Type: (?P<type>.*?)<',
                'Pow/Tgh: (?P<pow>.*?)<',
                'Rules Text: (?P<rules>.*?)<br /',
                'Rarity: (?P<rarity>.*?)<',
                'Set Number: #(?P<setnumber>.*?)/'
                ]
    d = feedparser.parse(mtgs)
    cards = []
    # NOTE(review): d.items()[5][1] relies on the Python 2 dict item order of
    # feedparser's result to reach the 'entries' list — fragile; verify.
    for entry in d.items()[5][1]:
        # Start every card with empty defaults so later code can index freely.
        card = dict(cost='', cmc='', img='', pow='', name='', rules='', type='',
                    color='', altname='', colorIdentity='', colorArray=[], colorIdentityArray=[], setnumber='', rarity='')
        summary = entry['summary']
        for pattern in patterns:
            match = re.search(pattern, summary, re.MULTILINE | re.DOTALL)
            if match:
                # Each pattern has a single named group; items()[0] is it.
                dg = match.groupdict()
                card[dg.items()[0][0]] = dg.items()[0][1]
        cards.append(card)
    # if we didn't find any cards, let's bail out to prevent overwriting good data
    count = 0
    for card in cards:
        count = count + 1
    if count < 1:
        sys.exit("No cards found, exiting to prevent file overwrite")
    # Split any '//' or Aftermath cards into two half-cards (a/b numbers).
    cards2 = []
    for card in cards:
        if 'rules' in card:
            # Strip residual HTML tags from the rules text.
            htmltags = re.compile(r'<.*?>')
            card['rules'] = htmltags.sub('', card['rules'])
        if '//' in card['name'] or 'Aftermath' in card['rules']:
            print 'Splitting up Aftermath card ' + card['name']
            card1 = card.copy()
            card2 = dict(cost='', cmc='', img='', pow='', name='', rules='', type='',
                         color='', altname='', colorIdentity='', colorArray=[], colorIdentityArray=[], setnumber='', rarity='')
            if '//' in card['name']:
                # Name already contains both halves: "Front // Back".
                card['name'] = card['name'].replace(' // ', '//')
                card1['name'] = card['name'].split('//')[0]
                card2["name"] = card['name'].split('//')[1]
            else:
                # Aftermath card spoiled under one name; the back half's name
                # is the first word-run of the second rules paragraph.
                card1['name'] = card['name']
                card2["name"] = card['rules'].split(
                    '\n\n')[1].strip().split(' {')[0]
            card1['rules'] = card['rules'].split('\n\n')[0].strip()
            card2["rules"] = "Aftermath" + card['rules'].split('Aftermath')[1]
            # Back half's mana cost/type are embedded in the combined rules.
            card2['cost'] = re.findall(
                r'{.*}', card['rules'])[0].replace('{', '').replace('}', '').upper()
            card2['type'] = re.findall(
                r'}\n.*\n', card['rules'])[0].replace('}', '').replace('\n', '')
            if 'setnumber' in card:
                card1['setnumber'] = card['setnumber'] + 'a'
                card2['setnumber'] = card['setnumber'] + 'b'
            if 'rarity' in card:
                card2['rarity'] = card['rarity']
            if not card1['name'] in split_cards:
                # Record the pairing for later passes (mutates the argument).
                split_cards[card1['name']] = card2['name']
            card1['layout'] = 'aftermath'
            card2['layout'] = 'aftermath'
            cards2.append(card1)
            cards2.append(card2)
        else:
            cards2.append(card)
    cards = cards2
    # Clean up HTML entities, known feed typos, and derive cmc/colors.
    for card in cards:
        card['name'] = card['name'].replace('&#x27;', '\'')
        # NOTE(review): the u'' replacement targets below look like a
        # mis-encoded bullet character (likely u'\u2022') lost in transit —
        # confirm against the original file.
        card['rules'] = card['rules'].replace('&#x27;', '\'') \
            .replace('&lt;i&gt;', '') \
            .replace('&lt;/i&gt;', '') \
            .replace('&quot;', '"') \
            .replace('blkocking', 'blocking')\
            .replace('&amp;bull;', u'')\
            .replace('&bull;', u'')\
            .replace('comes into the', 'enters the')\
            .replace('threeor', 'three or')\
            .replace('[i]', '')\
            .replace('[/i]', '')\
            .replace('Lawlwss', 'Lawless')\
            .replace('Costner', "Counter")
        # NOTE(review): the first replace below appears to map a mis-encoded
        # non-breaking space to a plain space — confirm the original bytes.
        card['type'] = card['type'].replace(' ', ' ')\
            .replace('Crature', 'Creature')
        if card['type'][-1] == ' ':
            card['type'] = card['type'][:-1]
        if 'cost' in card and len(card['cost']) > 0:
            # Converted mana cost: digits add their value, X adds 0,
            # every other symbol adds 1.
            workingCMC = 0
            stripCost = card['cost'].replace('{', '').replace('}', '')
            for manaSymbol in stripCost:
                if manaSymbol.isdigit():
                    workingCMC += int(manaSymbol)
                elif not manaSymbol == 'X':
                    workingCMC += 1
            card['cmc'] = workingCMC
        for c in 'WUBRG':  # figure out card's color
            if c not in card['colorIdentity']:
                if c in card['cost']:
                    card['color'] += c
                    card['colorIdentity'] += c
            # Mana symbols in rules text extend color identity only.
            if (c + '}') in card['rules'] or (str.lower(c) + '}') in card['rules']:
                if not (c in card['colorIdentity']):
                    card['colorIdentity'] += c
    cleanedcards = []
    for card in cards:  # let's remove any cards that are named in delete_cards array
        if not card['name'] in delete_cards:
            cleanedcards.append(card)
    cards = cleanedcards
    # Final pass: build mtgjson-shaped dicts, de-duplicated by name.
    cardarray = []
    for card in cards:
        dupe = False
        for dupecheck in cardarray:
            if dupecheck['name'] == card['name']:
                dupe = True
        if dupe == True:
            continue
        for cid in card['colorIdentity']:
            card['colorIdentityArray'].append(cid)
        if 'W' in card['color']:
            card['colorArray'].append('White')
        if 'U' in card['color']:
            card['colorArray'].append('Blue')
        if 'B' in card['color']:
            card['colorArray'].append('Black')
        if 'R' in card['color']:
            card['colorArray'].append('Red')
        if 'G' in card['color']:
            card['colorArray'].append('Green')
        cardpower = ''
        cardtoughness = ''
        if len(card['pow'].split('/')) > 1:
            cardpower = card['pow'].split('/')[0]
            cardtoughness = card['pow'].split('/')[1]
        cardnames = []
        cardnumber = card['setnumber'].lstrip('0')
        # Double-faced pairing: front faces get 'a', back faces get 'b'.
        if card['name'] in related_cards:
            cardnames.append(card['name'])
            cardnames.append(related_cards[card['name']])
            cardnumber += 'a'
            card['layout'] = 'double-faced'
        for namematch in related_cards:
            if card['name'] == related_cards[namematch]:
                card['layout'] = 'double-faced'
                cardnames.append(namematch)
                if not card['name'] in cardnames:
                    cardnames.append(card['name'])
                cardnumber += 'b'
        # Split pairing works the same way but resets the names list first.
        cardnames = []
        if card['name'] in split_cards:
            cardnames.append(card['name'])
            cardnames.append(split_cards[card['name']])
            cardnumber = cardnumber.replace('b', '').replace('a', '') + 'a'
            if not 'layout' in card:
                card['layout'] = 'split'
        for namematch in split_cards:
            if card['name'] == split_cards[namematch]:
                if not 'layout' in card or ('layout' in card and card['layout'] == ''):
                    card['layout'] = 'split'
                cardnames.append(namematch)
                if not card['name'] in cardnames:
                    cardnames.append(card['name'])
                cardnumber = cardnumber.replace(
                    'b', '').replace('a', '') + 'b'
        if 'number' in card:
            if 'b' in card['number'] or 'a' in card['number']:
                if not 'layout' in card:
                    print card['name'] + " has a a/b number but no 'layout'"
        card['type'] = card['type'].replace('instant', 'Instant').replace(
            'sorcery', 'Sorcery').replace('creature', 'Creature')
        if '-' in card['type']:
            subtype = card['type'].split(' - ')[1].strip()
        else:
            subtype = False
        if subtype:
            subtypes = subtype.split(' ')
        else:
            subtypes = False
        if card['cmc'] == '':
            card['cmc'] = 0
        cardjson = {}
        #cardjson["id"] = hashlib.sha1(setname + card['name'] + str(card['name']).lower()).hexdigest()
        cardjson["cmc"] = card['cmc']
        cardjson["manaCost"] = card['cost']
        cardjson["name"] = card['name']
        cardjson["number"] = cardnumber
        # not sure if mtgjson has a list of acceptable rarities, but my application does
        # so we'll warn me but continue to write a non-standard rarity (timeshifted?)
        # may force 'special' in the future
        if card['rarity'] not in ['Mythic Rare', 'Rare', 'Uncommon', 'Common', 'Special', 'Basic Land']:
            #errors.append({"name": card['name'], "key": "rarity", "value": card['rarity']})
            print card['name'] + ' has rarity = ' + card['rarity']
        if subtypes:
            cardjson['subtypes'] = subtypes
        cardjson["rarity"] = card['rarity']
        cardjson["text"] = card['rules']
        cardjson["type"] = card['type']
        workingtypes = card['type']
        if ' - ' in workingtypes:
            workingtypes = card['type'].split(' - ')[0]
        # Supertypes are stripped off; what remains are the card types.
        cardjson['types'] = workingtypes.replace('Legendary ', '').replace('Snow ', '')\
            .replace('Elite ', '').replace('Basic ', '').replace('World ', '').replace('Ongoing ', '')\
            .strip().split(' ')
        cardjson["url"] = card['img']
        # optional fields
        if len(card['colorIdentityArray']) > 0:
            cardjson["colorIdentity"] = card['colorIdentityArray']
        if len(card['colorArray']) > 0:
            cardjson["colors"] = card['colorArray']
        if len(cardnames) > 1:
            cardjson["names"] = cardnames
        if cardpower or cardpower == '0':
            cardjson["power"] = cardpower
            cardjson["toughness"] = cardtoughness
        if card.has_key('loyalty'):
            cardjson["loyalty"] = card['loyalty']
        if card.has_key('layout'):
            cardjson["layout"] = card['layout']
        cardarray.append(cardjson)
    return [{"cards": cardarray}, split_cards]
def scrape_mtgs_images(url='http://www.mtgsalvation.com/spoilers/183-hour-of-devastation', mtgscardurl='http://www.mtgsalvation.com/cards/hour-of-devastation/', exemptlist=[]):
    """Scrape card image URLs from MTG Salvation's per-card spoiler pages.

    Visits the set's spoiler index, then each card's individual page, and
    pulls the spoiler image URL from it.

    :param url: spoiler index page for the set
    :param mtgscardurl: base URL for individual card pages
    :param exemptlist: card names to skip entirely
    :return: dict mapping card name -> {"url": image-url-or-empty-string}
    """
    page = requests.get(url)
    tree = html.fromstring(page.content)
    cards = {}
    cardstree = tree.xpath('//*[contains(@class, "log-card")]')
    for child in cardstree:
        if child.text in exemptlist:
            continue
        # Card page slug: "<data-card-id>-<name-with-dashes>", punctuation dropped.
        childurl = mtgscardurl + child.attrib['data-card-id'] + '-' + child.text.replace(
            ' ', '-').replace("'", "").replace(',', '').replace('-//', '')
        cardpage = requests.get(childurl)
        tree = html.fromstring(cardpage.content)
        cardtree = tree.xpath('//img[contains(@class, "card-spoiler-image")]')
        try:
            cardurl = cardtree[0].attrib['src']
        except:
            # No spoiler image on the page (or page layout changed).
            cardurl = ''
            pass
        cards[child.text] = {
            "url": cardurl
        }
        # Be polite to the server between page fetches.
        time.sleep(.2)
    return cards

87
mythic_scraper.py Normal file
View File

@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
import requests
import time
from bs4 import BeautifulSoup as BS
from bs4 import Comment
# mtgjson is optional, will ignore cards found if passed
# mtgjson is optional, will ignore cards found if passed
def get_mythic_cards(url='http://mythicspoiler.com/ixa/', mtgjson=False):
    """Scrape all spoiled cards for a set from mythicspoiler.com.

    :param url: the set's index page on mythicspoiler
    :param mtgjson: optional existing {"cards": [...]} dict; when given, only
        cards NOT already present (matched by name within the card-page URL)
        are scraped, and they are appended to it
    :return: {"cards": [...]} — either freshly built or the augmented mtgjson
    """
    cards = {'cards': []}
    r = requests.get(url)
    soup = BS(r.text, "html.parser")
    cardurls = soup.find_all('a', 'card')
    urllist = []
    for cardurl in cardurls:
        # Pull the href out of the anchor's string form; skip malformed tags.
        try:
            urllist.append(url + str(cardurl).split("href=\"")
                           [1].split('"><img')[0])
        except:
            pass
    if not mtgjson:
        for url in urllist:
            # NOTE(review): the page is scraped twice here (once for the check,
            # once for the append) — confirm whether that is intentional.
            card = scrape_mythic_card_page(url)
            if card != '' and 'name' in card and card['name'] != '':
                cards['cards'].append(scrape_mythic_card_page(url))
            time.sleep(.5)
    else:
        for url in urllist:
            # Only scrape card pages we don't already have data for.
            needsScraped = True
            for card in mtgjson['cards']:
                if card['name'].lower().replace(' ', '') in url:
                    needsScraped = False
            if needsScraped:
                card = scrape_mythic_card_page(url)
                if card != '' and 'name' in card and card['name'] != '':
                    mtgjson['cards'].append(card)
        cards = mtgjson
    return cards
def scrape_mythic_card_page(url):
    """Scrape a single mythicspoiler card page into a card dict.

    Mythicspoiler pages carry HTML comments ('CARD NAME', 'MANA COST', ...)
    immediately before each piece of card data, so we locate those comment
    markers and read the element that follows each one.

    :param url: the card's page URL
    :return: dict with whichever of name/manaCost/type/text/number/power/
        toughness could be extracted (possibly empty)
    """
    r = requests.get(url)
    soup = BS(r.text, "html.parser")
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    card = {}
    for comment in comments:
        if comment == 'CARD NAME':
            card['name'] = comment.next_element.strip().replace('"', '')
        elif comment == 'MANA COST':
            try:
                card['manaCost'] = comment.next_element.strip().replace('"', '')
            except:
                pass
        elif comment == 'TYPE':
            card['type'] = comment.next_element.strip().replace('"', '')
        elif comment == 'CARD TEXT':
            # Rules text spans multiple elements; accumulate until the
            # 'FLAVOR TEXT' marker is reached.
            buildText = ''
            for element in comment.next_elements:
                try:
                    if not element.strip() in ['CARD TEXT', 'FLAVOR TEXT', '']:
                        if buildText != '':
                            buildText += '\n'
                        buildText += element.strip()
                    if element.strip() == 'FLAVOR TEXT':
                        card['text'] = buildText
                        break
                except:
                    # Non-string nodes have no .strip(); skip them.
                    pass
        elif comment == 'Set Number':
            try:
                card['number'] = comment.next_element.strip()
            except:
                pass
        elif comment == 'P/T':
            try:
                # Only record P/T when the marker actually has values.
                if comment.next_element.strip().split('/')[0] != '':
                    card['power'] = comment.next_element.strip().split('/')[0]
                    card['toughness'] = comment.next_element.strip().split('/')[1]
            except:
                pass
    return card

170
scryfall_scraper.py Normal file
View File

@@ -0,0 +1,170 @@
# -*- coding: utf-8 -*-
import requests
import time
def get_scryfall(setUrl):
    """Download a set's cards from the Scryfall search API.

    Follows the API's pagination via 'has_more'/'next_page' until exhausted.

    :param setUrl: a Scryfall /cards/search URL for the set
    :return: {"cards": [...]} in mtgjson-ish shape (via convert_scryfall),
        or {"cards": []} when Scryfall returned no data
    """
    #getUrl = 'https://api.scryfall.com/cards/search?q=++e:'
    #setUrl = getUrl + setname.lower()
    setDone = False
    scryfall = []
    while setDone == False:
        setcards = requests.get(setUrl)
        setcards = setcards.json()
        if setcards.has_key('data'):
            scryfall.append(setcards['data'])
        else:
            setDone = True
            # print setUrl
            # print setcards
            print 'No Scryfall data'
            scryfall = ['']
        # Light rate limiting between API pages.
        time.sleep(.1)
        if setcards.has_key('has_more'):
            if setcards['has_more'] == True:
                setUrl = setcards['next_page']
            else:
                setDone = True
        else:
            setDone = True
    if not scryfall[0] == '':
        # NOTE(review): only scryfall[0] (the first page) is converted even
        # though later pages are appended above — looks like a pagination bug;
        # confirm against live multi-page sets.
        scryfall = convert_scryfall(scryfall[0])
        return {'cards': scryfall}
    else:
        return {'cards': []}
def convert_scryfall(scryfall):
    """Convert a list of raw Scryfall card objects to mtgjson-style dicts.

    Fixes over the previous revision:
    - The type-line separator literals were mis-encoded as empty strings
      (``split(u'')`` raises ``ValueError``); restored to the em dash
      u'\u2014' that Scryfall actually uses in ``type_line``.
    - ``dict.has_key`` (removed in Python 3) replaced with ``in`` —
      equivalent on Python 2.
    - Types/subtypes are stripped before splitting so the lists no longer
      contain empty-string entries (e.g. ['Creature', ''] -> ['Creature']).

    :param scryfall: list of card objects as returned by the Scryfall API
    :return: list of converted card dicts
    """
    EM_DASH = u'\u2014'  # Scryfall separates types from subtypes with an em dash
    cards2 = []
    for card in scryfall:
        card2 = {}
        card2['cmc'] = int(card['cmc'])
        if 'mana_cost' in card:
            card2['manaCost'] = card['mana_cost'].replace(
                '{', '').replace('}', '')
        else:
            card2['manaCost'] = ''
        card2['name'] = card['name']
        card2['number'] = card['collector_number']
        # Scryfall says 'mythic'; mtgjson wants 'Mythic Rare' (title-cased).
        card2['rarity'] = card['rarity'].replace(
            'mythic', 'mythic rare').title()
        if 'oracle_text' in card:
            # Normalize em dash and minus sign to plain ASCII hyphens.
            card2['text'] = card['oracle_text'].replace(
                EM_DASH, '-').replace(u"\u2212", "-")
        else:
            card2['text'] = ''
        card2['url'] = card['image_uri']
        if 'type_line' not in card:
            card['type_line'] = 'Unknown'
        card2['type'] = card['type_line'].replace(EM_DASH, '-')
        # Card types: everything before the em dash, minus supertypes.
        cardtypes = card['type_line'].split(EM_DASH)[0].replace('Legendary ', '').replace('Snow ', '')\
            .replace('Elite ', '').replace('Basic ', '').replace('World ', '').replace('Ongoing ', '')
        cardtypes = cardtypes.strip().split(' ')
        if EM_DASH in card['type_line']:
            cardsubtypes = card['type_line'].split(EM_DASH)[1].strip()
            if ' ' in cardsubtypes:
                card2['subtypes'] = cardsubtypes.split(' ')
            else:
                card2['subtypes'] = [cardsubtypes]
        # Collect any supertypes present anywhere in the type line.
        for supertype in ('Legendary', 'Snow', 'Elite', 'Basic', 'World', 'Ongoing'):
            if supertype in card['type_line']:
                card2.setdefault('supertypes', []).append(supertype)
        card2['types'] = cardtypes
        if 'color_identity' in card:
            card2['colorIdentity'] = card['color_identity']
        if card.get('colors'):
            # Expand WUBRG symbols to full color names, in WUBRG order.
            color_names = {'W': 'White', 'U': 'Blue',
                           'B': 'Black', 'R': 'Red', 'G': 'Green'}
            card2['colors'] = []
            for symbol in 'WUBRG':
                if symbol in card['colors']:
                    card2['colors'].append(color_names[symbol])
        if 'all_parts' in card:
            # Related faces/parts (split, meld, tokens) by name.
            card2['names'] = [part['name'] for part in card['all_parts']]
        if 'power' in card:
            card2['power'] = card['power']
        if 'toughness' in card:
            card2['toughness'] = card['toughness']
        if 'layout' in card and card['layout'] != 'normal':
            card2['layout'] = card['layout']
        if 'loyalty' in card:
            card2['loyalty'] = card['loyalty']
        if 'artist' in card:
            card2['artist'] = card['artist']
        if 'flavor_text' in card:
            card2['flavor'] = card['flavor_text']
        if 'multiverse_id' in card:
            card2['multiverseid'] = card['multiverse_id']
        cards2.append(card2)
    return cards2
def smash_mtgs_scryfall(mtgs, scryfall):
    """Debug-compare MTGS card data against Scryfall card data.

    Prints every field that differs between same-named cards, plus cards
    present in only one of the two sources. Purely diagnostic: nothing is
    merged or modified.

    :param mtgs: {"cards": [...]} from the MTGS parser
    :param scryfall: {"cards": [...]} from the Scryfall scraper
    :return: mtgs, unchanged
    """
    for mtgscard in mtgs['cards']:
        cardFound = False
        for scryfallcard in scryfall['cards']:
            if scryfallcard['name'] == mtgscard['name']:
                for key in scryfallcard:
                    if key in mtgscard:
                        if not mtgscard[key] == scryfallcard[key]:
                            try:
                                print "%s's key %s\nMTGS : %s\nScryfall: %s" % (mtgscard['name'], key, mtgscard[key], scryfallcard[key])
                            except:
                                # Unicode printing can fail on some consoles.
                                print "Error printing Scryfall vs MTGS debug info for " + mtgscard['name']
                                pass
                cardFound = True
        if not cardFound:
            print "MTGS has card %s and Scryfall does not." % mtgscard['name']
    # Reverse direction: cards Scryfall has that MTGS lacks.
    for scryfallcard in scryfall['cards']:
        cardFound = False
        for mtgscard in mtgs['cards']:
            if scryfallcard['name'] == mtgscard['name']:
                cardFound = True
        if not cardFound:
            print "Scryfall has card %s and MTGS does not." % scryfallcard['name']
    return mtgs

File diff suppressed because it is too large Load Diff

252
wizards_scraper.py Normal file
View File

@@ -0,0 +1,252 @@
# -*- coding: utf-8 -*-
import requests
from lxml import html
from PIL import Image
def scrape_fullspoil(url="http://magic.wizards.com/en/articles/archive/card-image-gallery/hour-devastation", setinfo={"setname": "HOU"}, showRarityColors=False, showFrameColors=False, manual_cards=[], delete_cards=[], split_cards=[]):
    """Scrape the official WOTC card image gallery for a set.

    Builds the gallery URL from the set's long name when available, collects
    every card image, downloads the images, and infers rarity and mana cost
    from the image pixels.

    :param url: fallback gallery URL when setinfo lacks 'setlongname'
    :param setinfo: set metadata dict; must contain 'setname'
    :param showRarityColors, showFrameColors, manual_cards, delete_cards,
        split_cards: currently unused (accepted for interface compatibility)
    :return: {"cards": [...]} with name/img/url plus inferred rarity/manaCost
    """
    if 'setlongname' in setinfo:
        # NOTE(review): the replace('of', '') / replace(' ', ' ') chain below
        # looks garbled in transit (likely 'of ' removal and double-space
        # collapse in the original) — confirm against the original file.
        url = 'http://magic.wizards.com/en/articles/archive/card-image-gallery/' + setinfo['setlongname'].lower().replace('of', '').replace(
            ' ', ' ').replace(' ', '-')
    page = requests.get(url)
    tree = html.fromstring(page.content)
    cards = []
    cardtree = tree.xpath('//*[@id="content-detail-page-of-an-article"]')
    for child in cardtree:
        cardElements = child.xpath('//*/p/img')
        cardcount = 0
        for cardElement in cardElements:
            # alt text carries the card name; normalize curly apostrophes and
            # keep only the front face of '///' combined names.
            card = {
                "name": cardElement.attrib['alt'].replace(u"\u2019", '\'').split(' /// ')[0],
                "img": cardElement.attrib['src']
            }
            card["url"] = card["img"]
            #card["cmc"] = 0
            #card["manaCost"] = ""
            #card["type"] = ""
            #card["types"] = []
            #card["text"] = ""
            #card["colorIdentity"] = [""]
            # if card['name'] in split_cards:
            #     card["names"] = [card['name'], split_cards[card['name']]]
            #     card["layout"] = "split"
            #notSplit = True
            # for backsplit in split_cards:
            #     if card['name'] == split_cards[backsplit]:
            #         notSplit = False
            # if not card['name'] in delete_cards:
            cards.append(card)
            cardcount += 1
    fullspoil = {"cards": cards}
    # NOTE(review): cardcount is defined inside the loop above; an empty
    # gallery would leave it unbound here.
    print "Spoil Gallery has " + str(cardcount) + " cards."
    # download_images is defined elsewhere in this project (not in this file).
    download_images(fullspoil['cards'], setinfo['setname'])
    fullspoil = get_rarities_by_symbol(fullspoil, setinfo['setname'])
    fullspoil = get_mana_symbols(fullspoil, setinfo['setname'])
    #fullspoil = get_colors_by_frame(fullspoil, setinfo['setname'])
    return fullspoil
def get_rarities_by_symbol(fullspoil, setcode, split_cards=[]):
    """Infer each card's rarity from the set-symbol pixels of its image.

    Crops the set symbol region from the downloaded card image, averages its
    RGB channels, and picks the rarity whose reference color is closest.

    :param fullspoil: {"cards": [...]} whose images exist under images/<setcode>/
    :param setcode: set code used in the image directory path
    :param split_cards: unused (accepted for interface compatibility)
    :return: fullspoil with card['rarity'] filled in where an image existed
    """
    symbolPixels = (240, 219, 242, 221)  # crop box for a normal card's set symbol
    highVariance = 15  # worse than this and the match is considered unreliable
    # Reference mean [R, G, B] for each rarity's symbol color.
    colorAverages = {
        "Common": [30, 27, 28],
        "Uncommon": [121, 155, 169],
        "Rare": [166, 143, 80],
        "Mythic Rare": [201, 85, 14]
    }
    symbolCount = 0
    for card in fullspoil['cards']:
        try:
            cardImage = Image.open(
                'images/' + setcode + '/' + card['name'].replace(' // ', '') + '.jpg')
        except:
            # No downloaded image for this card; skip it.
            continue
            pass
        if '//' in card['name']:
            # Split cards are rotated, so their symbol sits elsewhere.
            setSymbol = cardImage.crop((240, 138, 242, 140))
        else:
            setSymbol = cardImage.crop(symbolPixels)
        # Weighted channel means from the crop's histogram.
        cardHistogram = setSymbol.histogram()
        reds = cardHistogram[0:256]
        greens = cardHistogram[256:256 * 2]
        blues = cardHistogram[256 * 2: 256 * 3]
        reds = sum(i * w for i, w in enumerate(reds)) / sum(reds)
        greens = sum(i * w for i, w in enumerate(greens)) / sum(greens)
        blues = sum(i * w for i, w in enumerate(blues)) / sum(blues)
        variance = 768  # max possible channel distance; any match beats this
        for color in colorAverages:
            colorVariance = 0
            colorVariance = colorVariance + \
                abs(colorAverages[color][0] - reds)
            colorVariance = colorVariance + \
                abs(colorAverages[color][1] - greens)
            colorVariance = colorVariance + \
                abs(colorAverages[color][2] - blues)
            if colorVariance < variance:
                variance = colorVariance
                card['rarity'] = color
        if variance > highVariance:
            # if a card isn't close to any of the colors, it's probably a planeswalker? make it mythic.
            print card['name'], 'has high variance of', variance, ', closest rarity is', card['rarity']
            card['rarity'] = "Mythic Rare"
        # print card['name'], '$', reds, greens, blues
        if symbolCount < 10:
            # Save the first few crops for manual spot-checking.
            setSymbol.save(
                'images/' + card['name'].replace(' // ', '') + '.symbol.jpg')
            symbolCount += 1
    return fullspoil
def get_colors_by_frame(fullspoil, setcode, split_cards={}):
    """Infer each card's color(s) from the frame pixels of its image.

    Same technique as get_rarities_by_symbol: crop a strip of the card frame,
    average the RGB channels, and pick the closest reference frame color.
    Currently unused by scrape_fullspoil (call site is commented out).

    :param fullspoil: {"cards": [...]} whose images exist under images/<setcode>/
    :param setcode: set code used in the image directory path
    :param split_cards: unused (accepted for interface compatibility)
    :return: fullspoil with card['colors'] set to a one-element list
    """
    framePixels = (20, 11, 76, 16)  # crop box along the card's top frame
    highVariance = 10  # NOTE(review): defined but never checked below
    # Reference mean [R, G, B] for each frame color.
    colorAverages = {
        "White": [231, 225, 200],
        "Blue": [103, 193, 230],
        "Black": [58, 61, 54],
        "Red": [221, 122, 101],
        "Green": [118, 165, 131],
        "Multicolor": [219, 200, 138],
        "Artifact": [141, 165, 173],
        "Colorless": [216, 197, 176],
    }
    symbolCount = 0
    for card in fullspoil['cards']:
        try:
            cardImage = Image.open(
                'images/' + setcode + '/' + card['name'].replace(' // ', '') + '.jpg')
        except:
            # No downloaded image for this card; skip it.
            continue
            pass
        cardColor = cardImage.crop(framePixels)
        # Weighted channel means from the crop's histogram.
        cardHistogram = cardColor.histogram()
        reds = cardHistogram[0:256]
        greens = cardHistogram[256:256 * 2]
        blues = cardHistogram[256 * 2: 256 * 3]
        reds = sum(i * w for i, w in enumerate(reds)) / sum(reds)
        greens = sum(i * w for i, w in enumerate(greens)) / sum(greens)
        blues = sum(i * w for i, w in enumerate(blues)) / sum(blues)
        variance = 768
        for color in colorAverages:
            colorVariance = 0
            colorVariance = colorVariance + \
                abs(colorAverages[color][0] - reds)
            colorVariance = colorVariance + \
                abs(colorAverages[color][1] - greens)
            colorVariance = colorVariance + \
                abs(colorAverages[color][2] - blues)
            if colorVariance < variance:
                variance = colorVariance
                card['colors'] = [color]
    return fullspoil
def get_mana_symbols(fullspoil={}, setcode="HOU", split_cards=[]):
    """Infer each card's mana cost by color-matching its mana symbol pixels.

    Scans up to five fixed crop boxes along the top-right of the card image,
    right to left, matching each against reference symbol colors. Symbols
    '2' and '5' look alike, so a second, tighter crop disambiguates them.

    :param fullspoil: {"cards": [...]} whose images exist under images/<setcode>/
    :param setcode: set code used in the image directory path
    :param split_cards: unused (accepted for interface compatibility)
    :return: fullspoil with card['manaCost'] built from recognized symbols
    """
    # Crop boxes for the five rightmost mana symbol positions, right to left.
    manaBoxes = [(234, 23, 244, 33), (220, 23, 230, 33),
                 (206, 23, 216, 33), (192, 23, 202, 33), (178, 23, 188, 33)]
    highVariance = 0  # NOTE(review): defined but never used below
    # Reference mean [R, G, B] for each recognizable mana symbol.
    colorAverages = {
        "W": [126, 123, 110],
        "U": [115, 140, 151],
        "B": [105, 99, 98],
        "R": [120, 89, 77],
        "G": [65, 78, 69],
        "1": [162, 156, 154],
        "2": [155, 148, 147],
        "3": [160, 153, 152],
        "4": [149, 143, 141],
        "5": [155, 149, 147],
        "6": [151, 145, 143],
        "7": [169, 163, 161],
        "X": [160, 154, 152]
    }
    for card in fullspoil['cards']:
        try:
            cardImage = Image.open(
                'images/' + setcode + '/' + card['name'].replace(' // ', '') + '.jpg')
        except:
            # No downloaded image for this card; skip it.
            continue
            pass
        card['manaCost'] = ""
        for manaBox in manaBoxes:
            # Weighted channel means for this symbol position.
            manaSymbol = cardImage.crop(manaBox)
            cardHistogram = manaSymbol.histogram()
            reds = cardHistogram[0:256]
            greens = cardHistogram[256:256 * 2]
            blues = cardHistogram[256 * 2: 256 * 3]
            reds = sum(i * w for i, w in enumerate(reds)) / sum(reds)
            greens = sum(i * w for i, w in enumerate(greens)) / sum(greens)
            blues = sum(i * w for i, w in enumerate(blues)) / sum(blues)
            variance = 768
            for color in colorAverages:
                colorVariance = 0
                colorVariance = colorVariance + \
                    abs(colorAverages[color][0] - reds)
                colorVariance = colorVariance + \
                    abs(colorAverages[color][1] - greens)
                colorVariance = colorVariance + \
                    abs(colorAverages[color][2] - blues)
                if colorVariance < variance:
                    variance = colorVariance
                    closestColor = color
            if variance < 10:
                # if card['name'] in ["Mirage Mirror", "Uncage the Menagerie", "Torment of Hailfire"]:
                #     print card['name'] + " " + str(reds) + " " + str(greens) + " " + str(blues)
                if closestColor in ["2", "5"]:
                    # '2' and '5' symbols are nearly identical; re-sample a
                    # tighter interior crop to tell them apart.
                    twoVSfive = (
                        manaBox[0] + 1, manaBox[1] + 4, manaBox[2] - 5, manaBox[3] - 2)
                    manaSymbol = cardImage.crop(twoVSfive)
                    cardHistogram = manaSymbol.histogram()
                    reds = cardHistogram[0:256]
                    greens = cardHistogram[256:256 * 2]
                    blues = cardHistogram[256 * 2: 256 * 3]
                    reds = sum(
                        i * w for i, w in enumerate(reds)) / sum(reds)
                    greens = sum(
                        i * w for i, w in enumerate(greens)) / sum(greens)
                    blues = sum(
                        i * w for i, w in enumerate(blues)) / sum(blues)
                    variance = 768
                    colorVariance = 0
                    colorVariance = colorVariance + abs(175 - reds)
                    colorVariance = colorVariance + abs(168 - greens)
                    colorVariance = colorVariance + abs(166 - blues)
                    if colorVariance < 10:
                        closestColor = "2"
                    elif colorVariance > 110 and colorVariance < 120:
                        closestColor = "5"
            else:
                # No confident symbol match at this position.
                continue
            # Boxes are scanned right to left, so prepend to build the cost.
            card['manaCost'] = closestColor + card['manaCost']
    return fullspoil
def smash_fullspoil(mtgjson, fullspoil):
    """Debug-compare mtgjson card data against the WOTC gallery scrape.

    Prints fields that differ between same-named cards (except 'colors'),
    and cards that appear only in the WOTC gallery. Diagnostic only; nothing
    is merged or modified.

    :param mtgjson: {"cards": [...]} built from the other scrapers
    :param fullspoil: {"cards": [...]} from scrape_fullspoil
    :return: None (results are printed)
    """
    different_keys = {}
    for mtgjson_card in mtgjson['cards']:
        for fullspoil_card in fullspoil['cards']:
            if mtgjson_card['name'] == fullspoil_card['name']:
                for key in fullspoil_card:
                    if key in mtgjson_card:
                        if mtgjson_card[key] != fullspoil_card[key] and key != 'colors':
                            if not fullspoil_card['name'] in different_keys:
                                different_keys[fullspoil_card['name']] = {
                                    key: fullspoil_card[key]}
                            else:
                                different_keys[fullspoil_card['name']
                                               ][key] = fullspoil_card[key]
    for fullspoil_card in fullspoil['cards']:
        # NOTE(review): WOTC_only is re-initialized on every iteration, so at
        # most one name is ever reported per print — looks like the init was
        # meant to sit before the loop; confirm intent.
        WOTC_only = []
        match = False
        for mtgjson_card in mtgjson['cards']:
            if mtgjson_card['name'] == fullspoil_card['name']:
                match = True
        if not match:
            WOTC_only.append(fullspoil_card['name'])
        if len(WOTC_only) > 0:
            print "WOTC only cards: "
            print WOTC_only
    print different_keys