Refactor set_info, download_images to scraper sub

Refactor set_info to align with mtgjson keys.

Move download_images to wizards_scraper
This commit is contained in:
tritoch 2017-07-06 19:46:26 -05:00 committed by GitHub
parent 599aaee733
commit dc9b9b7a48
6 changed files with 94 additions and 93 deletions

30
main.py
View File

@ -58,7 +58,7 @@ def save_allsets(AllSets):
def save_masterpieces(masterpieces, setinfo):
with open('out/' + setinfo['masterpieces']['setname'] + '.json', 'w') as outfile:
with open('out/' + setinfo['masterpieces']['code'] + '.json', 'w') as outfile:
json.dump(masterpieces, outfile, sort_keys=True,
indent=2, separators=(',', ': '))
@ -90,8 +90,8 @@ if __name__ == '__main__':
AllSets = spoilers.get_allsets() # get AllSets from mtgjson
combinedjson = {}
for setinfo in setinfos:
if setinfo['setname'] in AllSets:
print "Found set from set_info.yml " +setinfo['setname']+ " in MTGJSON, not adding it"
if setinfo['code'] in AllSets:
print "Found set from set_info.yml " +setinfo['code']+ " in MTGJSON, not adding it"
continue
if presets['oldRSS'] or 'noRSS' in setinfo and setinfo['noRSS']:
mtgs = {"cards": []}
@ -101,12 +101,12 @@ if __name__ == '__main__':
[mtgs, split_cards] = mtgs_scraper.parse_mtgs(
mtgs, [], [], [], presets['split_cards']) # parse spoilers into mtgjson format
mtgs = spoilers.correct_cards(
mtgs, manual_sets[setinfo['setname']], card_corrections, delete_cards['delete']) # fix using the fixfiles
mtgs, manual_sets[setinfo['code']], card_corrections, delete_cards['delete']) # fix using the fixfiles
mtgjson = spoilers.get_image_urls(
mtgs, presets['isfullspoil'], setinfo['setname'], setinfo['setlongname'], setinfo['setsize'], setinfo) # get images
mtgs, presets['isfullspoil'], setinfo['code'], setinfo['name'], setinfo['size'], setinfo) # get images
if presets['scryfallComparison']:
scryfall = scryfall_scraper.get_scryfall(
'https://api.scryfall.com/cards/search?q=++e:' + setinfo['setname'].lower())
'https://api.scryfall.com/cards/search?q=++e:' + setinfo['code'].lower())
mtgjson = scryfall_scraper.smash_mtgs_scryfall(mtgs, scryfall)
if 'fullSpoil' in setinfo and setinfo['fullSpoil']:
wotc = wizards_scraper.scrape_fullspoil('', setinfo)
@ -115,10 +115,10 @@ if __name__ == '__main__':
mtgjson, card_corrections) # check for errors where possible
errorlog += errors
spoilers.write_xml(
mtgjson, setinfo['setname'], setinfo['setlongname'], setinfo['setreleasedate'])
#save_xml(spoilers.pretty_xml(setinfo['setname']), 'out/spoiler.xml')
mtgjson, setinfo['code'], setinfo['name'], setinfo['releaseDate'])
#save_xml(spoilers.pretty_xml(setinfo['code']), 'out/spoiler.xml')
mtgjson = spoilers.add_headers(mtgjson, setinfo)
AllSets = spoilers.make_allsets(AllSets, mtgjson, setinfo['setname'])
AllSets = spoilers.make_allsets(AllSets, mtgjson, setinfo['code'])
if 'masterpieces' in setinfo: # repeat all of the above for masterpieces
# masterpieces aren't in the rss feed, so for the new cards, we'll go to their individual pages on mtgs
# old cards will get their infos copied from mtgjson (including fields that may not apply like 'artist')
@ -127,14 +127,14 @@ if __name__ == '__main__':
setinfo['masterpieces'], AllSets, mtgjson)
[masterpieces, errors] = spoilers.error_check(masterpieces)
errorlog += errors
spoilers.write_xml(masterpieces, setinfo['masterpieces']['setname'],
setinfo['masterpieces']['setlongname'], setinfo['masterpieces']['setreleasedate'])
spoilers.write_xml(masterpieces, setinfo['masterpieces']['code'],
setinfo['masterpieces']['name'], setinfo['masterpieces']['releaseDate'])
AllSets = spoilers.make_allsets(
AllSets, masterpieces, setinfo['masterpieces']['setname'])
AllSets, masterpieces, setinfo['masterpieces']['code'])
save_masterpieces(masterpieces, setinfo)
combinedjson[setinfo['masterpieces']['setname']] = masterpieces
save_setjson(mtgjson, setinfo['setname'])
combinedjson[setinfo['setname']] = mtgjson
combinedjson[setinfo['masterpieces']['code']] = masterpieces
save_setjson(mtgjson, setinfo['code'])
combinedjson[setinfo['code']] = mtgjson
save_setjson(combinedjson, 'spoiler')
spoilers.write_combined_xml(combinedjson, setinfos)
save_xml(spoilers.pretty_xml('out/spoiler.xml'), 'out/spoiler.xml')

View File

@ -199,7 +199,7 @@ def parse_mtgs(mtgs, manual_cards=[], card_corrections=[], delete_cards=[], spli
if card['cmc'] == '':
card['cmc'] = 0
cardjson = {}
#cardjson["id"] = hashlib.sha1(setname + card['name'] + str(card['name']).lower()).hexdigest()
#cardjson["id"] = hashlib.sha1(code + card['name'] + str(card['name']).lower()).hexdigest()
cardjson["cmc"] = card['cmc']
cardjson["manaCost"] = card['cost']
cardjson["name"] = card['name']

View File

@ -5,7 +5,7 @@ import time
def get_scryfall(setUrl):
#getUrl = 'https://api.scryfall.com/cards/search?q=++e:'
#setUrl = getUrl + setname.lower()
#setUrl = getUrl + code.lower()
setDone = False
scryfall = []

View File

@ -2,21 +2,21 @@
#
# required keys
#
# setname: FSN
# setlongname: "Full Set Name"
# setsize: 274
# setreleasedate: "2050-02-28"
# settype: expansion
# code: FSN
# name: "Full Set Name"
# size: 274
# releaseDate: "2050-02-28"
# type: expansion
#
# optional keys
#
# blockname: "Block Name"
# block: "Block Name"
# mtgsurl: "http://url_to_mtgsalvation.com/spoilers/page"
# mtgscardpath: "http://url_to_mtgsalvation.com/cards/setpage/"
# fullSpoil: false
# masterpieces:
#
# Masterpieces contain setname, setlongname, setreleasedate as above
# Masterpieces contain code, name, releaseDate as above
# and require mtgsurl and mtgscardpath;
# they can also contain
#
@ -24,37 +24,37 @@
#
---
setname: "HOU"
setlongname: "Hour of Devastation"
blockname: "Amonkhet"
setsize: 199
setreleasedate: "2017-07-14"
settype: "expansion"
code: "HOU"
name: "Hour of Devastation"
block: "Amonkhet"
size: 199
releaseDate: "2017-07-14"
type: "expansion"
mtgsurl: "http://www.mtgsalvation.com/spoilers/183-hour-of-devastation"
mtgscardpath: "http://www.mtgsalvation.com/cards/hour-of-devastation/"
fullSpoil: false
masterpieces:
setname: "MPS_AKH"
setlongname: "Masterpiece Series: Amonkhet Invocations"
setreleasedate: "2017-04-28"
code: "MPS_AKH"
name: "Masterpiece Series: Amonkhet Invocations"
releaseDate: "2017-04-28"
alternativeNames: ["Amonkhet Invocations"]
galleryURL: "http://magic.wizards.com/en/articles/archive/feature/masterpiece-series-hour-devastation-invocations-2017-06-19"
additionalCardNames: []
mtgsurl: "http://www.mtgsalvation.com/spoilers/181-amonkhet-invocations"
mtgscardpath: "http://www.mtgsalvation.com/cards/amonkhet-invocations/"
---
setname: "XLN"
setlongname: "Ixalan"
blockname: "Ixalan"
setsize: 279
setreleasedate: "2017-09-29"
settype: "expansion"
code: "XLN"
name: "Ixalan"
block: "Ixalan"
size: 279
releaseDate: "2017-09-29"
type: "expansion"
noRSS: true
---
setname: "C17"
setlongname: "Commander 2017"
setsize: 309
setreleasedate: "2017-09-29"
settype: "commander"
code: "C17"
name: "Commander 2017"
size: 309
releaseDate: "2017-09-29"
type: "commander"
noRSS: true
noBooster: true

View File

@ -283,35 +283,19 @@ def remove_corrected_errors(errorlog=[], card_corrections=[], print_fixed=False)
return errorlog2
def download_images(mtgjson, setcode):
    """Download each card's image into ``images/<setcode>/``.

    ``mtgjson`` is either a dict with a ``'cards'`` list or the list of card
    dicts itself. Each card dict must have ``'url'`` and ``'name'`` keys.
    Cards with a falsy ``'url'`` and cards whose image file already exists
    are skipped.

    The image is saved as ``images/<setcode>/<name>.jpg`` where ``' // '``
    (the split-card separator) is removed from the name.
    """
    image_dir = 'images/' + setcode
    if not os.path.isdir(image_dir):
        os.makedirs(image_dir)
    if 'cards' in mtgjson:
        jsoncards = mtgjson['cards']
    else:
        jsoncards = mtgjson
    for card in jsoncards:
        if not card['url']:
            continue
        image_path = image_dir + '/' + card['name'].replace(' // ', '') + '.jpg'
        if os.path.isfile(image_path):
            continue
        # requests.get() only fetches; its second positional argument is
        # ``params`` (query string), not a destination path, so the response
        # body has to be written to disk explicitly.
        response = requests.get(card['url'])
        if not response.ok:
            # Best effort: do not store an HTTP error page as a .jpg.
            continue
        with open(image_path, 'wb') as image_file:
            image_file.write(response.content)
def get_image_urls(mtgjson, isfullspoil, setname, setlongname, setSize=269, setinfo=False):
def get_image_urls(mtgjson, isfullspoil, code, name, size=269, setinfo=False):
IMAGES = 'http://magic.wizards.com/en/content/' + \
setlongname.lower().replace(' ', '-') + '-cards'
name.lower().replace(' ', '-') + '-cards'
IMAGES2 = 'http://mythicspoiler.com/newspoilers.html'
IMAGES3 = 'http://magic.wizards.com/en/articles/archive/card-image-gallery/' + \
setlongname.lower().replace('of', '').replace(' ', ' ').replace(' ', '-')
name.lower().replace('of', '').replace(' ', ' ').replace(' ', '-')
text = requests.get(IMAGES).text
text2 = requests.get(IMAGES2).text
text3 = requests.get(IMAGES3).text
wotcpattern = r'<img alt="{}.*?" src="(?P<img>.*?\.png)"'
wotcpattern2 = r'<img src="(?P<img>.*?\.png).*?alt="{}.*?"'
mythicspoilerpattern = r' src="' + setname.lower() + '/cards/{}.*?.jpg">'
mythicspoilerpattern = r' src="' + code.lower() + '/cards/{}.*?.jpg">'
WOTC = []
for c in mtgjson['cards']:
if 'names' in c:
@ -356,10 +340,10 @@ def get_image_urls(mtgjson, isfullspoil, setname, setlongname, setSize=269, seti
return mtgjson
def write_xml(mtgjson, setname, setlongname, setreleasedate, split_cards=[]):
def write_xml(mtgjson, code, name, releaseDate, split_cards=[]):
if not os.path.isdir('out/'):
os.makedirs('out/')
cardsxml = open('out/' + setname + '.xml', 'w+')
cardsxml = open('out/' + code + '.xml', 'w+')
cardsxml.truncate()
count = 0
dfccount = 0
@ -368,14 +352,14 @@ def write_xml(mtgjson, setname, setlongname, setreleasedate, split_cards=[]):
cardsxml.write("<?xml version='1.0' encoding='UTF-8'?>\n"
"<cockatrice_carddatabase version='3'>\n"
"<sets>\n<set>\n<name>"
+ setname +
+ code +
"</name>\n"
"<longname>"
+ setlongname +
+ name +
"</longname>\n"
"<settype>Expansion</settype>\n"
"<releasedate>"
+ setreleasedate +
+ releaseDate +
"</releasedate>\n"
"</set>\n"
"</sets>\n"
@ -445,7 +429,7 @@ def write_xml(mtgjson, setname, setlongname, setreleasedate, split_cards=[]):
cardsxml.write("<card>\n")
cardsxml.write("<name>" + name.encode('utf-8') + "</name>\n")
cardsxml.write(
'<set rarity="' + card['rarity'] + '" picURL="' + card["url"] + '">' + setname + '</set>\n')
'<set rarity="' + card['rarity'] + '" picURL="' + card["url"] + '">' + code + '</set>\n')
cardsxml.write(
"<manacost>" + manacost.encode('utf-8') + "</manacost>\n")
cardsxml.write("<cmc>" + cardcmc + "</cmc>\n")
@ -479,7 +463,7 @@ def write_xml(mtgjson, setname, setlongname, setreleasedate, split_cards=[]):
cardsxml.write("</cards>\n</cockatrice_carddatabase>")
print 'XML Stats for ' + setlongname
print 'XML Stats for ' + name
print 'Total cards: ' + str(count)
if dfccount > 0:
print 'DFC: ' + str(dfccount)
@ -641,8 +625,8 @@ def pretty_xml(infile):
return pretty_xml_as_string
def make_allsets(AllSets, mtgjson, setname):
AllSets[setname] = mtgjson
def make_allsets(AllSets, mtgjson, code):
AllSets[code] = mtgjson
return AllSets
@ -676,8 +660,8 @@ def make_masterpieces(headers, AllSets, spoil):
masterpieces2 = []
for masterpiece in masterpieces:
matched = False
if headers['setname'] in AllSets:
for oldMasterpiece in AllSets[headers['setname']]['cards']:
if headers['code'] in AllSets:
for oldMasterpiece in AllSets[headers['code']]['cards']:
if masterpiece['name'] == oldMasterpiece['name']:
matched = True
for set in AllSets:
@ -703,10 +687,10 @@ def make_masterpieces(headers, AllSets, spoil):
print "We couldn't find a card object to assign the data to for masterpiece " + masterpiece['name']
masterpieces2.append(masterpiece)
mpsjson = {
"name": headers['setlongname'],
"name": headers['name'],
"alternativeNames": headers['alternativeNames'],
"code": headers['setname'],
"releaseDate": headers['setreleasedate'],
"code": headers['code'],
"releaseDate": headers['releaseDate'],
"border": "black",
"type": "masterpiece",
"cards": masterpieces2
@ -715,11 +699,11 @@ def make_masterpieces(headers, AllSets, spoil):
def set_has_cards(setinfo, manual_cards, mtgjson):
if setinfo['setname'] in manual_cards or setinfo['setname'] in mtgjson:
if setinfo['code'] in manual_cards or setinfo['code'] in mtgjson:
return True
for card in manual_cards['cards']:
if set in card:
if set == setinfo['setname']:
if set == setinfo['code']:
return True
@ -733,10 +717,10 @@ def get_allsets():
def add_headers(mtgjson, setinfos):
mtgjson2 = {
"border": "black",
"code": setinfos['setname'],
"name": setinfos['setlongname'],
"releaseDate": setinfos['setreleasedate'],
"type": setinfos['settype'],
"code": setinfos['code'],
"name": setinfos['name'],
"releaseDate": setinfos['releaseDate'],
"type": setinfos['type'],
"cards": mtgjson['cards']
}
if not 'noBooster' in setinfos:
@ -761,6 +745,6 @@ def add_headers(mtgjson, setinfos):
"land",
"marketing"
],
if 'blockname' in setinfos:
mtgjson2['block'] = setinfos['blockname']
if 'block' in setinfos:
mtgjson2['block'] = setinfos['block']
return mtgjson2

View File

@ -2,11 +2,12 @@
import requests
from lxml import html
from PIL import Image
import os
def scrape_fullspoil(url="http://magic.wizards.com/en/articles/archive/card-image-gallery/hour-devastation", setinfo={"setname": "HOU"}, showRarityColors=False, showFrameColors=False, manual_cards=[], delete_cards=[], split_cards=[]):
if 'setlongname' in setinfo:
url = 'http://magic.wizards.com/en/articles/archive/card-image-gallery/' + setinfo['setlongname'].lower().replace('of', '').replace(
def scrape_fullspoil(url="http://magic.wizards.com/en/articles/archive/card-image-gallery/hour-devastation", setinfo={"code": "HOU"}, showRarityColors=False, showFrameColors=False, manual_cards=[], delete_cards=[], split_cards=[]):
if 'name' in setinfo:
url = 'http://magic.wizards.com/en/articles/archive/card-image-gallery/' + setinfo['name'].lower().replace('of', '').replace(
' ', ' ').replace(' ', '-')
page = requests.get(url)
tree = html.fromstring(page.content)
@ -40,10 +41,10 @@ def scrape_fullspoil(url="http://magic.wizards.com/en/articles/archive/card-imag
cardcount += 1
fullspoil = {"cards": cards}
print "Spoil Gallery has " + str(cardcount) + " cards."
download_images(fullspoil['cards'], setinfo['setname'])
fullspoil = get_rarities_by_symbol(fullspoil, setinfo['setname'])
fullspoil = get_mana_symbols(fullspoil, setinfo['setname'])
#fullspoil = get_colors_by_frame(fullspoil, setinfo['setname'])
download_images(fullspoil['cards'], setinfo['code'])
fullspoil = get_rarities_by_symbol(fullspoil, setinfo['code'])
fullspoil = get_mana_symbols(fullspoil, setinfo['code'])
#fullspoil = get_colors_by_frame(fullspoil, setinfo['code'])
return fullspoil
@ -250,3 +251,19 @@ def smash_fullspoil(mtgjson, fullspoil):
print "WOTC only cards: "
print WOTC_only
print different_keys
def download_images(mtgjson, setcode):
    """Download each card's image into ``images/<setcode>/``.

    ``mtgjson`` is either a dict with a ``'cards'`` list or the list of card
    dicts itself. Each card dict must have ``'url'`` and ``'name'`` keys.
    Cards with a falsy ``'url'`` and cards whose image file already exists
    are skipped.

    The image is saved as ``images/<setcode>/<name>.jpg`` where ``' // '``
    (the split-card separator) is removed from the name.
    """
    image_dir = 'images/' + setcode
    if not os.path.isdir(image_dir):
        os.makedirs(image_dir)
    if 'cards' in mtgjson:
        jsoncards = mtgjson['cards']
    else:
        jsoncards = mtgjson
    for card in jsoncards:
        if not card['url']:
            continue
        image_path = image_dir + '/' + card['name'].replace(' // ', '') + '.jpg'
        if os.path.isfile(image_path):
            continue
        # requests.get() only fetches; its second positional argument is
        # ``params`` (query string), not a destination path, so the response
        # body has to be written to disk explicitly.
        response = requests.get(card['url'])
        if not response.ok:
            # Best effort: do not store an HTTP error page as a .jpg.
            continue
        with open(image_path, 'wb') as image_file:
            image_file.write(response.content)