From dc9b9b7a48484933b3695db8a7cd97eeb357a993 Mon Sep 17 00:00:00 2001 From: tritoch Date: Thu, 6 Jul 2017 19:46:26 -0500 Subject: [PATCH] Refactor set_info, download_images to scraper sub Refactor set_info to align with mtgjson keys. Move download_images to wizards_scraper --- main.py | 30 ++++++++++---------- mtgs_scraper.py | 2 +- scryfall_scraper.py | 2 +- set_info.yml | 54 +++++++++++++++++------------------ spoilers.py | 68 +++++++++++++++++---------------------------- wizards_scraper.py | 31 ++++++++++++++++----- 6 files changed, 94 insertions(+), 93 deletions(-) diff --git a/main.py b/main.py index ebc352df..6159b874 100644 --- a/main.py +++ b/main.py @@ -58,7 +58,7 @@ def save_allsets(AllSets): def save_masterpieces(masterpieces, setinfo): - with open('out/' + setinfo['masterpieces']['setname'] + '.json', 'w') as outfile: + with open('out/' + setinfo['masterpieces']['code'] + '.json', 'w') as outfile: json.dump(masterpieces, outfile, sort_keys=True, indent=2, separators=(',', ': ')) @@ -90,8 +90,8 @@ if __name__ == '__main__': AllSets = spoilers.get_allsets() # get AllSets from mtgjson combinedjson = {} for setinfo in setinfos: - if setinfo['setname'] in AllSets: - print "Found set from set_info.yml " +setinfo['setname']+ " in MTGJSON, not adding it" + if setinfo['code'] in AllSets: + print "Found set from set_info.yml " +setinfo['code']+ " in MTGJSON, not adding it" continue if presets['oldRSS'] or 'noRSS' in setinfo and setinfo['noRSS']: mtgs = {"cards": []} @@ -101,12 +101,12 @@ if __name__ == '__main__': [mtgs, split_cards] = mtgs_scraper.parse_mtgs( mtgs, [], [], [], presets['split_cards']) # parse spoilers into mtgjson format mtgs = spoilers.correct_cards( - mtgs, manual_sets[setinfo['setname']], card_corrections, delete_cards['delete']) # fix using the fixfiles + mtgs, manual_sets[setinfo['code']], card_corrections, delete_cards['delete']) # fix using the fixfiles mtgjson = spoilers.get_image_urls( - mtgs, presets['isfullspoil'], setinfo['setname'], setinfo['setlongname'], setinfo['setsize'], setinfo) # get images + mtgs, presets['isfullspoil'], setinfo['code'], setinfo['name'], setinfo['size'], setinfo) # get images if presets['scryfallComparison']: scryfall = scryfall_scraper.get_scryfall( - 'https://api.scryfall.com/cards/search?q=++e:' + setinfo['setname'].lower()) + 'https://api.scryfall.com/cards/search?q=++e:' + setinfo['code'].lower()) mtgjson = scryfall_scraper.smash_mtgs_scryfall(mtgs, scryfall) if 'fullSpoil' in setinfo and setinfo['fullSpoil']: wotc = wizards_scraper.scrape_fullspoil('', setinfo) @@ -115,10 +115,10 @@ if __name__ == '__main__': mtgjson, card_corrections) # check for errors where possible errorlog += errors spoilers.write_xml( - mtgjson, setinfo['setname'], setinfo['setlongname'], setinfo['setreleasedate']) - #save_xml(spoilers.pretty_xml(setinfo['setname']), 'out/spoiler.xml') + mtgjson, setinfo['code'], setinfo['name'], setinfo['releaseDate']) + #save_xml(spoilers.pretty_xml(setinfo['code']), 'out/spoiler.xml') mtgjson = spoilers.add_headers(mtgjson, setinfo) - AllSets = spoilers.make_allsets(AllSets, mtgjson, setinfo['setname']) + AllSets = spoilers.make_allsets(AllSets, mtgjson, setinfo['code']) if 'masterpieces' in setinfo: # repeat all of the above for masterpieces # masterpieces aren't in the rss feed, so for the new cards, we'll go to their individual pages on mtgs # old cards will get their infos copied from mtgjson (including fields that may not apply like 'artist') @@ -127,14 +127,14 @@ if __name__ == '__main__': setinfo['masterpieces'], AllSets, mtgjson) [masterpieces, errors] = spoilers.error_check(masterpieces) errorlog += errors - spoilers.write_xml(masterpieces, setinfo['masterpieces']['setname'], - setinfo['masterpieces']['setlongname'], setinfo['masterpieces']['setreleasedate']) + spoilers.write_xml(masterpieces, setinfo['masterpieces']['code'], + setinfo['masterpieces']['name'], setinfo['masterpieces']['releaseDate']) AllSets = spoilers.make_allsets( - AllSets, masterpieces, setinfo['masterpieces']['setname']) + AllSets, masterpieces, setinfo['masterpieces']['code']) save_masterpieces(masterpieces, setinfo) - combinedjson[setinfo['masterpieces']['setname']] = masterpieces - save_setjson(mtgjson, setinfo['setname']) - combinedjson[setinfo['setname']] = mtgjson + combinedjson[setinfo['masterpieces']['code']] = masterpieces + save_setjson(mtgjson, setinfo['code']) + combinedjson[setinfo['code']] = mtgjson save_setjson(combinedjson, 'spoiler') spoilers.write_combined_xml(combinedjson, setinfos) save_xml(spoilers.pretty_xml('out/spoiler.xml'), 'out/spoiler.xml') diff --git a/mtgs_scraper.py b/mtgs_scraper.py index d3e56ea0..61e2c862 100644 --- a/mtgs_scraper.py +++ b/mtgs_scraper.py @@ -199,7 +199,7 @@ def parse_mtgs(mtgs, manual_cards=[], card_corrections=[], delete_cards=[], spli if card['cmc'] == '': card['cmc'] = 0 cardjson = {} - #cardjson["id"] = hashlib.sha1(setname + card['name'] + str(card['name']).lower()).hexdigest() + #cardjson["id"] = hashlib.sha1(code + card['name'] + str(card['name']).lower()).hexdigest() cardjson["cmc"] = card['cmc'] cardjson["manaCost"] = card['cost'] cardjson["name"] = card['name'] diff --git a/scryfall_scraper.py b/scryfall_scraper.py index 036bb7df..70d18e68 100644 --- a/scryfall_scraper.py +++ b/scryfall_scraper.py @@ -5,7 +5,7 @@ import time def get_scryfall(setUrl): #getUrl = 'https://api.scryfall.com/cards/search?q=++e:' - #setUrl = getUrl + setname.lower() + #setUrl = getUrl + code.lower() setDone = False scryfall = [] diff --git a/set_info.yml b/set_info.yml index c03d0b88..64713afc 100644 --- a/set_info.yml +++ b/set_info.yml @@ -2,21 +2,21 @@ # # required keys # -# setname: FSN -# setlongname: "Full Set Name" -# setsize: 274 -# setreleasedate: "2050-02-28" -# settype: expansion +# code: FSN +# name: "Full Set Name" +# size: 274 +# releaseDate: "2050-02-28" +# type: expansion # # optional keys # -# blockname: "Block Name" +# block: "Block Name" # mtgsurl: "http://url_to_mtgsalvation.com/spoilers/page # mtgscardpath "http://url_to_mtgsalvation.com/cards/setpage/" # fullSpoil: false # masterpieces: # -# Masterpieces contain setname, setlongname, setreleasedate as above +# Masterpieces contain code, name, releaseDate as above # and requires mtgsurl and mtgscardpath # also can contain # @@ -24,37 +24,37 @@ # --- -setname: "HOU" -setlongname: "Hour of Devastation" -blockname: "Amonkhet" -setsize: 199 -setreleasedate: "2017-07-14" -settype: "expansion" +code: "HOU" +name: "Hour of Devastation" +block: "Amonkhet" +size: 199 +releaseDate: "2017-07-14" +type: "expansion" mtgsurl: "http://www.mtgsalvation.com/spoilers/183-hour-of-devastation" mtgscardpath: "http://www.mtgsalvation.com/cards/hour-of-devastation/" fullSpoil: false masterpieces: - setname: "MPS_AKH" - setlongname: "Masterpiece Series: Amonkhet Invocations" - setreleasedate: "2017-04-28" + code: "MPS_AKH" + name: "Masterpiece Series: Amonkhet Invocations" + releaseDate: "2017-04-28" alternativeNames: ["Amonkhet Invocations"] galleryURL: "http://magic.wizards.com/en/articles/archive/feature/masterpiece-series-hour-devastation-invocations-2017-06-19" additionalCardNames: [] mtgsurl: "http://www.mtgsalvation.com/spoilers/181-amonkhet-invocations" mtgscardpath: "http://www.mtgsalvation.com/cards/amonkhet-invocations/" --- -setname: "XLN" -setlongname: "Ixalan" -blockname: "Ixalan" -setsize: 279 -setreleasedate: "2017-09-29" -settype: "expansion" +code: "XLN" +name: "Ixalan" +block: "Ixalan" +size: 279 +releaseDate: "2017-09-29" +type: "expansion" noRSS: true --- -setname: "C17" -setlongname: "Commander 2017" -setsize: 309 -setreleasedate: "2017-09-29" -settype: "commander" +code: "C17" +name: "Commander 2017" +size: 309 +releaseDate: "2017-09-29" +type: "commander" noRSS: true noBooster: true \ No newline at end of file diff --git a/spoilers.py b/spoilers.py index 191e7201..32c6f3d5 100644 --- a/spoilers.py +++ b/spoilers.py @@ -283,35 +283,19 @@ def remove_corrected_errors(errorlog=[], card_corrections=[], print_fixed=False) return errorlog2 -def download_images(mtgjson, setcode): - if not os.path.isdir('images/' + setcode): - os.makedirs('images/' + setcode) - if 'cards' in mtgjson: - jsoncards = mtgjson['cards'] - else: - jsoncards = mtgjson - for card in jsoncards: - if card['url']: - if os.path.isfile('images/' + setcode + '/' + card['name'].replace(' // ', '') + '.jpg'): - continue - # print 'Downloading ' + card['url'] + ' to images/' + setcode + '/' + card['name'].replace(' // ','') + '.jpg' - requests.get(card['url'], 'images/' + setcode + - '/' + card['name'].replace(' // ', '') + '.jpg') - - -def get_image_urls(mtgjson, isfullspoil, setname, setlongname, setSize=269, setinfo=False): +def get_image_urls(mtgjson, isfullspoil, code, name, size=269, setinfo=False): IMAGES = 'http://magic.wizards.com/en/content/' + \ - setlongname.lower().replace(' ', '-') + '-cards' + name.lower().replace(' ', '-') + '-cards' IMAGES2 = 'http://mythicspoiler.com/newspoilers.html' IMAGES3 = 'http://magic.wizards.com/en/articles/archive/card-image-gallery/' + \ - setlongname.lower().replace('of', '').replace(' ', ' ').replace(' ', '-') + name.lower().replace('of', '').replace(' ', ' ').replace(' ', '-') text = requests.get(IMAGES).text text2 = requests.get(IMAGES2).text text3 = requests.get(IMAGES3).text wotcpattern = r'{}.*?' + mythicspoilerpattern = r' src="' + code.lower() + '/cards/{}.*?.jpg">' WOTC = [] for c in mtgjson['cards']: if 'names' in c: @@ -356,10 +340,10 @@ def get_image_urls(mtgjson, isfullspoil, setname, setlongname, setSize=269, seti return mtgjson -def write_xml(mtgjson, setname, setlongname, setreleasedate, split_cards=[]): +def write_xml(mtgjson, code, name, releaseDate, split_cards=[]): if not os.path.isdir('out/'): os.makedirs('out/') - cardsxml = open('out/' + setname + '.xml', 'w+') + cardsxml = open('out/' + code + '.xml', 'w+') cardsxml.truncate() count = 0 dfccount = 0 @@ -368,14 +352,14 @@ def write_xml(mtgjson, setname, setlongname, setreleasedate, split_cards=[]): cardsxml.write("\n" "\n" "\n\n" - + setname + + + code + "\n" "" - + setlongname + + + name + "\n" "Expansion\n" "" - + setreleasedate + + + releaseDate + "\n" "\n" "\n" @@ -445,7 +429,7 @@ def write_xml(mtgjson, setname, setlongname, setreleasedate, split_cards=[]): cardsxml.write("\n") cardsxml.write("" + name.encode('utf-8') + "\n") cardsxml.write( - '' + setname + '\n') + '' + code + '\n') cardsxml.write( "" + manacost.encode('utf-8') + "\n") cardsxml.write("" + cardcmc + "\n") @@ -479,7 +463,7 @@ def write_xml(mtgjson, setname, setlongname, setreleasedate, split_cards=[]): cardsxml.write("\n") - print 'XML Stats for ' + setlongname + print 'XML Stats for ' + name print 'Total cards: ' + str(count) if dfccount > 0: print 'DFC: ' + str(dfccount) @@ -641,8 +625,8 @@ def pretty_xml(infile): return pretty_xml_as_string -def make_allsets(AllSets, mtgjson, setname): - AllSets[setname] = mtgjson +def make_allsets(AllSets, mtgjson, code): + AllSets[code] = mtgjson return AllSets @@ -676,8 +660,8 @@ def make_masterpieces(headers, AllSets, spoil): masterpieces2 = [] for masterpiece in masterpieces: matched = False - if headers['setname'] in AllSets: - for oldMasterpiece in AllSets[headers['setname']]['cards']: + if headers['code'] in AllSets: + for oldMasterpiece in AllSets[headers['code']]['cards']: if masterpiece['name'] == oldMasterpiece['name']: matched = True for set in AllSets: @@ -703,10 +687,10 @@ def make_masterpieces(headers, AllSets, spoil): print "We couldn't find a card object to assign the data to for masterpiece " + masterpiece['name'] masterpieces2.append(masterpiece) mpsjson = { - "name": headers['setlongname'], + "name": headers['name'], "alternativeNames": headers['alternativeNames'], - "code": headers['setname'], - "releaseDate": headers['setreleasedate'], + "code": headers['code'], + "releaseDate": headers['releaseDate'], "border": "black", "type": "masterpiece", "cards": masterpieces2 @@ -715,11 +699,11 @@ def make_masterpieces(headers, AllSets, spoil): def set_has_cards(setinfo, manual_cards, mtgjson): - if setinfo['setname'] in manual_cards or setinfo['setname'] in mtgjson: + if setinfo['code'] in manual_cards or setinfo['code'] in mtgjson: return True for card in manual_cards['cards']: if set in card: - if set == setinfo['setname']: + if set == setinfo['code']: return True @@ -733,10 +717,10 @@ def get_allsets(): def add_headers(mtgjson, setinfos): mtgjson2 = { "border": "black", - "code": setinfos['setname'], - "name": setinfos['setlongname'], - "releaseDate": setinfos['setreleasedate'], - "type": setinfos['settype'], + "code": setinfos['code'], + "name": setinfos['name'], + "releaseDate": setinfos['releaseDate'], + "type": setinfos['type'], "cards": mtgjson['cards'] } if not 'noBooster' in setinfos: @@ -761,6 +745,6 @@ def add_headers(mtgjson, setinfos): "land", "marketing" ], - if 'blockname' in setinfos: - mtgjson2['block'] = setinfos['blockname'] + if 'block' in setinfos: + mtgjson2['block'] = setinfos['block'] return mtgjson2 diff --git a/wizards_scraper.py b/wizards_scraper.py index 73737197..a2c56957 100644 --- a/wizards_scraper.py +++ b/wizards_scraper.py @@ -2,11 +2,12 @@ import requests from lxml import html from PIL import Image +import os -def scrape_fullspoil(url="http://magic.wizards.com/en/articles/archive/card-image-gallery/hour-devastation", setinfo={"setname": "HOU"}, showRarityColors=False, showFrameColors=False, manual_cards=[], delete_cards=[], split_cards=[]): - if 'setlongname' in setinfo: - url = 'http://magic.wizards.com/en/articles/archive/card-image-gallery/' + setinfo['setlongname'].lower().replace('of', '').replace( +def scrape_fullspoil(url="http://magic.wizards.com/en/articles/archive/card-image-gallery/hour-devastation", setinfo={"code": "HOU"}, showRarityColors=False, showFrameColors=False, manual_cards=[], delete_cards=[], split_cards=[]): + if 'name' in setinfo: + url = 'http://magic.wizards.com/en/articles/archive/card-image-gallery/' + setinfo['name'].lower().replace('of', '').replace( ' ', ' ').replace(' ', '-') page = requests.get(url) tree = html.fromstring(page.content) @@ -40,10 +41,10 @@ def scrape_fullspoil(url="http://magic.wizards.com/en/articles/archive/card-imag cardcount += 1 fullspoil = {"cards": cards} print "Spoil Gallery has " + str(cardcount) + " cards." - download_images(fullspoil['cards'], setinfo['setname']) - fullspoil = get_rarities_by_symbol(fullspoil, setinfo['setname']) - fullspoil = get_mana_symbols(fullspoil, setinfo['setname']) - #fullspoil = get_colors_by_frame(fullspoil, setinfo['setname']) + download_images(fullspoil['cards'], setinfo['code']) + fullspoil = get_rarities_by_symbol(fullspoil, setinfo['code']) + fullspoil = get_mana_symbols(fullspoil, setinfo['code']) + #fullspoil = get_colors_by_frame(fullspoil, setinfo['code']) return fullspoil @@ -250,3 +251,19 @@ def smash_fullspoil(mtgjson, fullspoil): print "WOTC only cards: " print WOTC_only print different_keys + + +def download_images(mtgjson, setcode): + if not os.path.isdir('images/' + setcode): + os.makedirs('images/' + setcode) + if 'cards' in mtgjson: + jsoncards = mtgjson['cards'] + else: + jsoncards = mtgjson + for card in jsoncards: + if card['url']: + if os.path.isfile('images/' + setcode + '/' + card['name'].replace(' // ', '') + '.jpg'): + continue + # print 'Downloading ' + card['url'] + ' to images/' + setcode + '/' + card['name'].replace(' // ','') + '.jpg' + requests.get(card['url'], 'images/' + setcode + + '/' + card['name'].replace(' // ', '') + '.jpg')