First pass refactoring scrapers into separate modules (#98)

Splits off the respective scrapers into submodules (mtgs_scraper.py, scryfall_scraper.py, mythic_scraper.py, wizards_scraper.py)
This commit is contained in:
Lee Matos 2017-07-05 21:44:45 -04:00 committed by tritoch
parent 2af17727a4
commit 1dd538d5a1
6 changed files with 1002 additions and 855 deletions

119
main.py
View File

@@ -1,5 +1,9 @@
# -*- coding: utf-8 -*-
import spoilers
import mtgs_scraper
import scryfall_scraper
import mythic_scraper
import wizards_scraper
import os
import commentjson
import json
@@ -7,17 +11,20 @@ import io
import sys
presets = {
"isfullspoil": False, # when full spoil comes around, we only want to use WOTC images
"includeMasterpieces": True, # if the set has masterpieces, let's get those too
"oldRSS": False, # maybe MTGS hasn't updated their spoiler.rss but new cards have leaked
"isfullspoil": False, # when full spoil comes around, we only want to use WOTC images
"includeMasterpieces": True, # if the set has masterpieces, let's get those too
"oldRSS": False, # maybe MTGS hasn't updated their spoiler.rss but new cards have leaked
"split_cards": {
},
"siteorder": ['scryfall','mtgs','mythicspoiler'], # if we want to use one site before another for card data TODO
"imageorder": ['wotc','scryfall','mtgs','mythicspoiler'], # prioritize images from certain sources
"useexclusively": '', # if we *only* want to use one site TODO
"dumpXML": False, # let travis print XML for testing
"scryfallComparison": False, #if we want to debug compare scryfall to other sources, enable
"dumpErrors": True # print the error log from out/errors.json
# if we want to use one site before another for card data TODO
"siteorder": ['scryfall', 'mtgs', 'mythicspoiler'],
# prioritize images from certain sources
"imageorder": ['wotc', 'scryfall', 'mtgs', 'mythicspoiler'],
"useexclusively": '', # if we *only* want to use one site TODO
"dumpXML": False, # let travis print XML for testing
# if we want to debug compare scryfall to other sources, enable
"scryfallComparison": False,
"dumpErrors": True # print the error log from out/errors.json
}
@@ -30,94 +37,114 @@ def load_json(json_file, lib_to_use):
output_file = json.load(data_file)
return output_file
except Exception as ex:
print "Unable to load file: " +json_file+ "\nException information:\n" + str(ex.args)
sys.exit("Unable to load file: "+json_file)
print "Unable to load file: " + json_file + "\nException information:\n" + str(ex.args)
sys.exit("Unable to load file: " + json_file)
setinfos = load_json('set_info','commentjson')
manual_sets = load_json('cards_manual','json')
card_corrections = load_json('cards_corrections','commentjson')
delete_cards = load_json('cards_delete','commentjson')
setinfos = load_json('set_info', 'commentjson')
manual_sets = load_json('cards_manual', 'json')
card_corrections = load_json('cards_corrections', 'commentjson')
delete_cards = load_json('cards_delete', 'commentjson')
errorlog = []
#TODO insert configparser to add config.ini file
# TODO insert configparser to add config.ini file
def parseargs():
for argument in sys.argv:
for preset in presets:
if argument.split('=')[0].lower().replace('-','') == preset.lower():
if argument.split('=')[0].lower().replace('-', '') == preset.lower():
argvalue = argument.split('=')[1]
if argvalue in ['true','True','T','t']:
if argvalue in ['true', 'True', 'T', 't']:
argvalue = True
elif argvalue in ['false','False','F','f']:
elif argvalue in ['false', 'False', 'F', 'f']:
argvalue = False
presets[preset] = argvalue
print "Setting preset " + preset + " to value " + str(argvalue)
def save_allsets(AllSets):
with io.open('out/AllSets.json', 'w', encoding='utf8') as json_file:
data = json.dumps(AllSets, ensure_ascii=False, encoding='utf8', indent=2, sort_keys=True, separators=(',',':'))
data = json.dumps(AllSets, ensure_ascii=False, encoding='utf8',
indent=2, sort_keys=True, separators=(',', ':'))
json_file.write(unicode(data))
def save_masterpieces(masterpieces, setinfo):
with open('out/' + setinfo['masterpieces']['setname'] + '.json', 'w') as outfile:
json.dump(masterpieces, outfile, sort_keys=True, indent=2, separators=(',', ': '))
json.dump(masterpieces, outfile, sort_keys=True,
indent=2, separators=(',', ': '))
def save_setjson(mtgs, filename):
with io.open('out/' + filename + '.json', 'w', encoding='utf8') as json_file:
data = json.dumps(mtgs, ensure_ascii=False, encoding='utf8', indent=2, sort_keys=True, separators=(',',':'))
data = json.dumps(mtgs, ensure_ascii=False, encoding='utf8',
indent=2, sort_keys=True, separators=(',', ':'))
json_file.write(unicode(data))
def save_errorlog(errorlog):
with open('out/errors.json', 'w') as outfile:
json.dump(errorlog, outfile, sort_keys=True, indent=2, separators=(',', ': '))
json.dump(errorlog, outfile, sort_keys=True,
indent=2, separators=(',', ': '))
def save_xml(xmlstring, outfile):
if os.path.exists(outfile):
append_or_write = 'w'
else:
append_or_write = 'w'
with open(outfile,append_or_write) as xmlfile:
with open(outfile, append_or_write) as xmlfile:
xmlfile.write(xmlstring.encode('utf-8'))
if __name__ == '__main__':
parseargs()
AllSets = spoilers.get_allsets() #get AllSets from mtgjson
AllSets = spoilers.get_allsets() # get AllSets from mtgjson
combinedjson = {}
for setinfo in setinfos:
if setinfo['setname'] in AllSets:
print "Found set from set_info " +setinfo['setname']+ " in MTGJSON, not adding it"
print "Found set from set_info " + setinfo['setname'] + " in MTGJSON, not adding it"
continue
if presets['oldRSS'] or 'noRSS' in setinfo and setinfo['noRSS']:
mtgs = { "cards":[] }
mtgs = {"cards": []}
else:
mtgs = spoilers.scrape_mtgs('http://www.mtgsalvation.com/spoilers.rss') #scrape mtgs rss feed
[mtgs, split_cards] = spoilers.parse_mtgs(mtgs, [], [], [], presets['split_cards']) #parse spoilers into mtgjson format
mtgs = spoilers.correct_cards(mtgs, manual_sets[setinfo['setname']]['cards'], card_corrections, delete_cards) #fix using the fixfiles
mtgjson = spoilers.get_image_urls(mtgs, presets['isfullspoil'], setinfo['setname'], setinfo['setlongname'], setinfo['setsize'], setinfo) #get images
mtgs = mtgs_scraper.scrape_mtgs(
'http://www.mtgsalvation.com/spoilers.rss') # scrape mtgs rss feed
[mtgs, split_cards] = mtgs_scraper.parse_mtgs(
mtgs, [], [], [], presets['split_cards']) # parse spoilers into mtgjson format
mtgs = spoilers.correct_cards(
mtgs, manual_sets[setinfo['setname']]['cards'], card_corrections, delete_cards) # fix using the fixfiles
mtgjson = spoilers.get_image_urls(
mtgs, presets['isfullspoil'], setinfo['setname'], setinfo['setlongname'], setinfo['setsize'], setinfo) # get images
if presets['scryfallComparison']:
scryfall = spoilers.get_scryfall(
scryfall = scryfall_scraper.get_scryfall(
'https://api.scryfall.com/cards/search?q=++e:' + setinfo['setname'].lower())
mtgjson = spoilers.smash_mtgs_scryfall(mtgs, scryfall)
mtgjson = scryfall_scraper.smash_mtgs_scryfall(mtgs, scryfall)
if 'fullSpoil' in setinfo and setinfo['fullSpoil']:
wotc = spoilers.scrape_fullspoil('', setinfo)
spoilers.smash_fullspoil(mtgjson, wotc)
[mtgjson, errors] = spoilers.error_check(mtgjson, card_corrections) #check for errors where possible
wotc = wizards_scraper.scrape_fullspoil('', setinfo)
wizards_scraper.smash_fullspoil(mtgjson, wotc)
[mtgjson, errors] = spoilers.error_check(
mtgjson, card_corrections) # check for errors where possible
errorlog += errors
spoilers.write_xml(mtgjson, setinfo['setname'], setinfo['setlongname'], setinfo['setreleasedate'])
spoilers.write_xml(
mtgjson, setinfo['setname'], setinfo['setlongname'], setinfo['setreleasedate'])
#save_xml(spoilers.pretty_xml(setinfo['setname']), 'out/spoiler.xml')
mtgjson = spoilers.add_headers(mtgjson, setinfo)
AllSets = spoilers.make_allsets(AllSets, mtgjson, setinfo['setname'])
if 'masterpieces' in setinfo: #repeat all of the above for masterpieces
#masterpieces aren't in the rss feed, so for the new cards, we'll go to their individual pages on mtgs
#old cards will get their infos copied from mtgjson (including fields that may not apply like 'artist')
#the images will still come from mtgs
masterpieces = spoilers.make_masterpieces(setinfo['masterpieces'], AllSets, mtgjson)
if 'masterpieces' in setinfo: # repeat all of the above for masterpieces
# masterpieces aren't in the rss feed, so for the new cards, we'll go to their individual pages on mtgs
# old cards will get their infos copied from mtgjson (including fields that may not apply like 'artist')
# the images will still come from mtgs
masterpieces = spoilers.make_masterpieces(
setinfo['masterpieces'], AllSets, mtgjson)
[masterpieces, errors] = spoilers.error_check(masterpieces)
errorlog += errors
spoilers.write_xml(masterpieces, setinfo['masterpieces']['setname'], setinfo['masterpieces']['setlongname'], setinfo['masterpieces']['setreleasedate'])
AllSets = spoilers.make_allsets(AllSets, masterpieces, setinfo['masterpieces']['setname'])
spoilers.write_xml(masterpieces, setinfo['masterpieces']['setname'],
setinfo['masterpieces']['setlongname'], setinfo['masterpieces']['setreleasedate'])
AllSets = spoilers.make_allsets(
AllSets, masterpieces, setinfo['masterpieces']['setname'])
save_masterpieces(masterpieces, setinfo)
combinedjson[setinfo['masterpieces']['setname']] = masterpieces
save_setjson(mtgjson, setinfo['setname'])
@@ -128,7 +155,7 @@ if __name__ == '__main__':
errorlog = spoilers.remove_corrected_errors(errorlog, card_corrections)
save_errorlog(errorlog)
save_allsets(AllSets)
#save_setjson(mtgjson)
# save_setjson(mtgjson)
if presets['dumpXML']:
print '<!----- DUMPING SPOILER.XML -----!>'
with open('out/spoiler.xml', 'r') as xmlfile:
@@ -137,7 +164,7 @@ if __name__ == '__main__':
if presets['dumpErrors']:
if errorlog != {}:
print '//----- DUMPING ERROR LOG -----'
print json.dumps(errorlog, ensure_ascii=False, encoding='utf8', indent=2, sort_keys=True, separators=(',',':'))
print json.dumps(errorlog, ensure_ascii=False, encoding='utf8', indent=2, sort_keys=True, separators=(',', ':'))
print '//----- END ERROR LOG -----'
else:
print "No Detected Errors!"

269
mtgs_scraper.py Normal file
View File

@@ -0,0 +1,269 @@
# -*- coding: utf-8 -*-
import requests
import feedparser
import re
import sys
import time
from lxml import html
def scrape_mtgs(url):
    """Fetch the raw MTG Salvation spoiler RSS feed.

    Sends aggressive anti-caching headers so we always get the freshest
    spoiler data rather than a stale cached copy.

    :param url: full URL of the spoiler RSS feed
    :return: the response body as text (the raw RSS/XML document)
    """
    no_cache_headers = {
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Expires': 'Thu, 01 Jan 1970 00:00:00 GMT',
    }
    response = requests.get(url, headers=no_cache_headers)
    return response.text
def parse_mtgs(mtgs, manual_cards=[], card_corrections=[], delete_cards=[], split_cards={}, related_cards=[]):
    """Parse the MTG Salvation spoiler RSS text into an mtgjson-style dict.

    :param mtgs: raw RSS/XML text as returned by scrape_mtgs()
    :param manual_cards: unused here (accepted for interface compatibility)
    :param card_corrections: unused here (accepted for interface compatibility)
    :param delete_cards: card names to drop from the result
    :param split_cards: mapping of front-half name -> back-half name; this
        function also ADDS entries to it as it discovers split/aftermath cards
        (mutable default — shared across calls, so callers should pass it in)
    :param related_cards: mapping of front-face -> back-face names for
        double-faced cards
    :return: [{"cards": [...]}, split_cards]
    """
    # The feed declares utf-16 but is actually utf-8; patch the XML prolog.
    mtgs = mtgs.replace('utf-16', 'utf-8')
    # One regex per card field; each has exactly one named group.
    patterns = ['<b>Name:</b> <b>(?P<name>.*?)<',
                'Cost: (?P<cost>[X]*\d{0,2}[XWUBRGC]*?)<',
                'Type: (?P<type>.*?)<',
                'Pow/Tgh: (?P<pow>.*?)<',
                'Rules Text: (?P<rules>.*?)<br /',
                'Rarity: (?P<rarity>.*?)<',
                'Set Number: #(?P<setnumber>.*?)/'
                ]
    d = feedparser.parse(mtgs)
    cards = []
    # NOTE(review): d.items()[5][1] relies on the Python 2 dict item order of
    # feedparser's result to reach the 'entries' list — fragile; verify.
    for entry in d.items()[5][1]:
        # Start every card with empty defaults so later code can index freely.
        card = dict(cost='', cmc='', img='', pow='', name='', rules='', type='',
                    color='', altname='', colorIdentity='', colorArray=[], colorIdentityArray=[], setnumber='', rarity='')
        summary = entry['summary']
        for pattern in patterns:
            match = re.search(pattern, summary, re.MULTILINE | re.DOTALL)
            if match:
                # Each pattern has a single named group; items()[0] is it.
                dg = match.groupdict()
                card[dg.items()[0][0]] = dg.items()[0][1]
        cards.append(card)
    # if we didn't find any cards, let's bail out to prevent overwriting good data
    count = 0
    for card in cards:
        count = count + 1
    if count < 1:
        sys.exit("No cards found, exiting to prevent file overwrite")
    # Split any '//' or Aftermath cards into two half-cards (a/b numbers).
    cards2 = []
    for card in cards:
        if 'rules' in card:
            # Strip residual HTML tags from the rules text.
            htmltags = re.compile(r'<.*?>')
            card['rules'] = htmltags.sub('', card['rules'])
        if '//' in card['name'] or 'Aftermath' in card['rules']:
            print 'Splitting up Aftermath card ' + card['name']
            card1 = card.copy()
            card2 = dict(cost='', cmc='', img='', pow='', name='', rules='', type='',
                         color='', altname='', colorIdentity='', colorArray=[], colorIdentityArray=[], setnumber='', rarity='')
            if '//' in card['name']:
                # Name already contains both halves: "Front // Back".
                card['name'] = card['name'].replace(' // ', '//')
                card1['name'] = card['name'].split('//')[0]
                card2["name"] = card['name'].split('//')[1]
            else:
                # Aftermath card spoiled under one name; the back half's name
                # is the first word-run of the second rules paragraph.
                card1['name'] = card['name']
                card2["name"] = card['rules'].split(
                    '\n\n')[1].strip().split(' {')[0]
            card1['rules'] = card['rules'].split('\n\n')[0].strip()
            card2["rules"] = "Aftermath" + card['rules'].split('Aftermath')[1]
            # Back half's mana cost/type are embedded in the combined rules.
            card2['cost'] = re.findall(
                r'{.*}', card['rules'])[0].replace('{', '').replace('}', '').upper()
            card2['type'] = re.findall(
                r'}\n.*\n', card['rules'])[0].replace('}', '').replace('\n', '')
            if 'setnumber' in card:
                card1['setnumber'] = card['setnumber'] + 'a'
                card2['setnumber'] = card['setnumber'] + 'b'
            if 'rarity' in card:
                card2['rarity'] = card['rarity']
            if not card1['name'] in split_cards:
                # Record the pairing for later passes (mutates the argument).
                split_cards[card1['name']] = card2['name']
            card1['layout'] = 'aftermath'
            card2['layout'] = 'aftermath'
            cards2.append(card1)
            cards2.append(card2)
        else:
            cards2.append(card)
    cards = cards2
    # Clean up HTML entities, known feed typos, and derive cmc/colors.
    for card in cards:
        card['name'] = card['name'].replace('&#x27;', '\'')
        # NOTE(review): the u'' replacement targets below look like a
        # mis-encoded bullet character (likely u'\u2022') lost in transit —
        # confirm against the original file.
        card['rules'] = card['rules'].replace('&#x27;', '\'') \
            .replace('&lt;i&gt;', '') \
            .replace('&lt;/i&gt;', '') \
            .replace('&quot;', '"') \
            .replace('blkocking', 'blocking')\
            .replace('&amp;bull;', u'')\
            .replace('&bull;', u'')\
            .replace('comes into the', 'enters the')\
            .replace('threeor', 'three or')\
            .replace('[i]', '')\
            .replace('[/i]', '')\
            .replace('Lawlwss', 'Lawless')\
            .replace('Costner', "Counter")
        # NOTE(review): the first replace below appears to map a mis-encoded
        # non-breaking space to a plain space — confirm the original bytes.
        card['type'] = card['type'].replace(' ', ' ')\
            .replace('Crature', 'Creature')
        if card['type'][-1] == ' ':
            card['type'] = card['type'][:-1]
        if 'cost' in card and len(card['cost']) > 0:
            # Converted mana cost: digits add their value, X adds 0,
            # every other symbol adds 1.
            workingCMC = 0
            stripCost = card['cost'].replace('{', '').replace('}', '')
            for manaSymbol in stripCost:
                if manaSymbol.isdigit():
                    workingCMC += int(manaSymbol)
                elif not manaSymbol == 'X':
                    workingCMC += 1
            card['cmc'] = workingCMC
        for c in 'WUBRG':  # figure out card's color
            if c not in card['colorIdentity']:
                if c in card['cost']:
                    card['color'] += c
                    card['colorIdentity'] += c
            # Mana symbols in rules text extend color identity only.
            if (c + '}') in card['rules'] or (str.lower(c) + '}') in card['rules']:
                if not (c in card['colorIdentity']):
                    card['colorIdentity'] += c
    cleanedcards = []
    for card in cards:  # let's remove any cards that are named in delete_cards array
        if not card['name'] in delete_cards:
            cleanedcards.append(card)
    cards = cleanedcards
    # Final pass: build mtgjson-shaped dicts, de-duplicated by name.
    cardarray = []
    for card in cards:
        dupe = False
        for dupecheck in cardarray:
            if dupecheck['name'] == card['name']:
                dupe = True
        if dupe == True:
            continue
        for cid in card['colorIdentity']:
            card['colorIdentityArray'].append(cid)
        if 'W' in card['color']:
            card['colorArray'].append('White')
        if 'U' in card['color']:
            card['colorArray'].append('Blue')
        if 'B' in card['color']:
            card['colorArray'].append('Black')
        if 'R' in card['color']:
            card['colorArray'].append('Red')
        if 'G' in card['color']:
            card['colorArray'].append('Green')
        cardpower = ''
        cardtoughness = ''
        if len(card['pow'].split('/')) > 1:
            cardpower = card['pow'].split('/')[0]
            cardtoughness = card['pow'].split('/')[1]
        cardnames = []
        cardnumber = card['setnumber'].lstrip('0')
        # Double-faced pairing: front faces get 'a', back faces get 'b'.
        if card['name'] in related_cards:
            cardnames.append(card['name'])
            cardnames.append(related_cards[card['name']])
            cardnumber += 'a'
            card['layout'] = 'double-faced'
        for namematch in related_cards:
            if card['name'] == related_cards[namematch]:
                card['layout'] = 'double-faced'
                cardnames.append(namematch)
                if not card['name'] in cardnames:
                    cardnames.append(card['name'])
                cardnumber += 'b'
        # Split pairing works the same way but resets the names list first.
        cardnames = []
        if card['name'] in split_cards:
            cardnames.append(card['name'])
            cardnames.append(split_cards[card['name']])
            cardnumber = cardnumber.replace('b', '').replace('a', '') + 'a'
            if not 'layout' in card:
                card['layout'] = 'split'
        for namematch in split_cards:
            if card['name'] == split_cards[namematch]:
                if not 'layout' in card or ('layout' in card and card['layout'] == ''):
                    card['layout'] = 'split'
                cardnames.append(namematch)
                if not card['name'] in cardnames:
                    cardnames.append(card['name'])
                cardnumber = cardnumber.replace(
                    'b', '').replace('a', '') + 'b'
        if 'number' in card:
            if 'b' in card['number'] or 'a' in card['number']:
                if not 'layout' in card:
                    print card['name'] + " has a a/b number but no 'layout'"
        card['type'] = card['type'].replace('instant', 'Instant').replace(
            'sorcery', 'Sorcery').replace('creature', 'Creature')
        if '-' in card['type']:
            subtype = card['type'].split(' - ')[1].strip()
        else:
            subtype = False
        if subtype:
            subtypes = subtype.split(' ')
        else:
            subtypes = False
        if card['cmc'] == '':
            card['cmc'] = 0
        cardjson = {}
        #cardjson["id"] = hashlib.sha1(setname + card['name'] + str(card['name']).lower()).hexdigest()
        cardjson["cmc"] = card['cmc']
        cardjson["manaCost"] = card['cost']
        cardjson["name"] = card['name']
        cardjson["number"] = cardnumber
        # not sure if mtgjson has a list of acceptable rarities, but my application does
        # so we'll warn me but continue to write a non-standard rarity (timeshifted?)
        # may force 'special' in the future
        if card['rarity'] not in ['Mythic Rare', 'Rare', 'Uncommon', 'Common', 'Special', 'Basic Land']:
            #errors.append({"name": card['name'], "key": "rarity", "value": card['rarity']})
            print card['name'] + ' has rarity = ' + card['rarity']
        if subtypes:
            cardjson['subtypes'] = subtypes
        cardjson["rarity"] = card['rarity']
        cardjson["text"] = card['rules']
        cardjson["type"] = card['type']
        workingtypes = card['type']
        if ' - ' in workingtypes:
            workingtypes = card['type'].split(' - ')[0]
        # Supertypes are stripped off; what remains are the card types.
        cardjson['types'] = workingtypes.replace('Legendary ', '').replace('Snow ', '')\
            .replace('Elite ', '').replace('Basic ', '').replace('World ', '').replace('Ongoing ', '')\
            .strip().split(' ')
        cardjson["url"] = card['img']
        # optional fields
        if len(card['colorIdentityArray']) > 0:
            cardjson["colorIdentity"] = card['colorIdentityArray']
        if len(card['colorArray']) > 0:
            cardjson["colors"] = card['colorArray']
        if len(cardnames) > 1:
            cardjson["names"] = cardnames
        if cardpower or cardpower == '0':
            cardjson["power"] = cardpower
            cardjson["toughness"] = cardtoughness
        if card.has_key('loyalty'):
            cardjson["loyalty"] = card['loyalty']
        if card.has_key('layout'):
            cardjson["layout"] = card['layout']
        cardarray.append(cardjson)
    return [{"cards": cardarray}, split_cards]
def scrape_mtgs_images(url='http://www.mtgsalvation.com/spoilers/183-hour-of-devastation', mtgscardurl='http://www.mtgsalvation.com/cards/hour-of-devastation/', exemptlist=[]):
    """Scrape card image URLs from MTG Salvation's per-card spoiler pages.

    Visits the set's spoiler index, then each card's individual page, and
    pulls the spoiler image URL from it.

    :param url: spoiler index page for the set
    :param mtgscardurl: base URL for individual card pages
    :param exemptlist: card names to skip entirely
    :return: dict mapping card name -> {"url": image-url-or-empty-string}
    """
    page = requests.get(url)
    tree = html.fromstring(page.content)
    cards = {}
    cardstree = tree.xpath('//*[contains(@class, "log-card")]')
    for child in cardstree:
        if child.text in exemptlist:
            continue
        # Card page slug: "<data-card-id>-<name-with-dashes>", punctuation dropped.
        childurl = mtgscardurl + child.attrib['data-card-id'] + '-' + child.text.replace(
            ' ', '-').replace("'", "").replace(',', '').replace('-//', '')
        cardpage = requests.get(childurl)
        tree = html.fromstring(cardpage.content)
        cardtree = tree.xpath('//img[contains(@class, "card-spoiler-image")]')
        try:
            cardurl = cardtree[0].attrib['src']
        except:
            # No spoiler image on the page (or page layout changed).
            cardurl = ''
            pass
        cards[child.text] = {
            "url": cardurl
        }
        # Be polite to the server between page fetches.
        time.sleep(.2)
    return cards

87
mythic_scraper.py Normal file
View File

@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
import requests
import time
from bs4 import BeautifulSoup as BS
from bs4 import Comment
# mtgjson is optional, will ignore cards found if passed
# mtgjson is optional, will ignore cards found if passed
def get_mythic_cards(url='http://mythicspoiler.com/ixa/', mtgjson=False):
    """Scrape all spoiled cards for a set from mythicspoiler.com.

    :param url: the set's index page on mythicspoiler
    :param mtgjson: optional existing {"cards": [...]} dict; when given, only
        cards NOT already present (matched by name within the card-page URL)
        are scraped, and they are appended to it
    :return: {"cards": [...]} — either freshly built or the augmented mtgjson
    """
    cards = {'cards': []}
    r = requests.get(url)
    soup = BS(r.text, "html.parser")
    cardurls = soup.find_all('a', 'card')
    urllist = []
    for cardurl in cardurls:
        # Pull the href out of the anchor's string form; skip malformed tags.
        try:
            urllist.append(url + str(cardurl).split("href=\"")
                           [1].split('"><img')[0])
        except:
            pass
    if not mtgjson:
        for url in urllist:
            # NOTE(review): the page is scraped twice here (once for the check,
            # once for the append) — confirm whether that is intentional.
            card = scrape_mythic_card_page(url)
            if card != '' and 'name' in card and card['name'] != '':
                cards['cards'].append(scrape_mythic_card_page(url))
            time.sleep(.5)
    else:
        for url in urllist:
            # Only scrape card pages we don't already have data for.
            needsScraped = True
            for card in mtgjson['cards']:
                if card['name'].lower().replace(' ', '') in url:
                    needsScraped = False
            if needsScraped:
                card = scrape_mythic_card_page(url)
                if card != '' and 'name' in card and card['name'] != '':
                    mtgjson['cards'].append(card)
        cards = mtgjson
    return cards
def scrape_mythic_card_page(url):
    """Scrape a single mythicspoiler card page into a card dict.

    Mythicspoiler pages carry HTML comments ('CARD NAME', 'MANA COST', ...)
    immediately before each piece of card data, so we locate those comment
    markers and read the element that follows each one.

    :param url: the card's page URL
    :return: dict with whichever of name/manaCost/type/text/number/power/
        toughness could be extracted (possibly empty)
    """
    r = requests.get(url)
    soup = BS(r.text, "html.parser")
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    card = {}
    for comment in comments:
        if comment == 'CARD NAME':
            card['name'] = comment.next_element.strip().replace('"', '')
        elif comment == 'MANA COST':
            try:
                card['manaCost'] = comment.next_element.strip().replace('"', '')
            except:
                pass
        elif comment == 'TYPE':
            card['type'] = comment.next_element.strip().replace('"', '')
        elif comment == 'CARD TEXT':
            # Rules text spans multiple elements; accumulate until the
            # 'FLAVOR TEXT' marker is reached.
            buildText = ''
            for element in comment.next_elements:
                try:
                    if not element.strip() in ['CARD TEXT', 'FLAVOR TEXT', '']:
                        if buildText != '':
                            buildText += '\n'
                        buildText += element.strip()
                    if element.strip() == 'FLAVOR TEXT':
                        card['text'] = buildText
                        break
                except:
                    # Non-string nodes have no .strip(); skip them.
                    pass
        elif comment == 'Set Number':
            try:
                card['number'] = comment.next_element.strip()
            except:
                pass
        elif comment == 'P/T':
            try:
                # Only record P/T when the marker actually has values.
                if comment.next_element.strip().split('/')[0] != '':
                    card['power'] = comment.next_element.strip().split('/')[0]
                    card['toughness'] = comment.next_element.strip().split('/')[1]
            except:
                pass
    return card

170
scryfall_scraper.py Normal file
View File

@@ -0,0 +1,170 @@
# -*- coding: utf-8 -*-
import requests
import time
def get_scryfall(setUrl):
    """Download a set's cards from the Scryfall search API.

    Follows the API's pagination via 'has_more'/'next_page' until exhausted.

    :param setUrl: a Scryfall /cards/search URL for the set
    :return: {"cards": [...]} in mtgjson-ish shape (via convert_scryfall),
        or {"cards": []} when Scryfall returned no data
    """
    #getUrl = 'https://api.scryfall.com/cards/search?q=++e:'
    #setUrl = getUrl + setname.lower()
    setDone = False
    scryfall = []
    while setDone == False:
        setcards = requests.get(setUrl)
        setcards = setcards.json()
        if setcards.has_key('data'):
            scryfall.append(setcards['data'])
        else:
            setDone = True
            # print setUrl
            # print setcards
            print 'No Scryfall data'
            scryfall = ['']
        # Light rate limiting between API pages.
        time.sleep(.1)
        if setcards.has_key('has_more'):
            if setcards['has_more'] == True:
                setUrl = setcards['next_page']
            else:
                setDone = True
        else:
            setDone = True
    if not scryfall[0] == '':
        # NOTE(review): only scryfall[0] (the first page) is converted even
        # though later pages are appended above — looks like a pagination bug;
        # confirm against live multi-page sets.
        scryfall = convert_scryfall(scryfall[0])
        return {'cards': scryfall}
    else:
        return {'cards': []}
def convert_scryfall(scryfall):
    """Convert a list of raw Scryfall card objects to mtgjson-style dicts.

    Fixes over the previous revision:
    - The type-line separator literals were mis-encoded as empty strings
      (``split(u'')`` raises ``ValueError``); restored to the em dash
      u'\u2014' that Scryfall actually uses in ``type_line``.
    - ``dict.has_key`` (removed in Python 3) replaced with ``in`` —
      equivalent on Python 2.
    - Types/subtypes are stripped before splitting so the lists no longer
      contain empty-string entries (e.g. ['Creature', ''] -> ['Creature']).

    :param scryfall: list of card objects as returned by the Scryfall API
    :return: list of converted card dicts
    """
    EM_DASH = u'\u2014'  # Scryfall separates types from subtypes with an em dash
    cards2 = []
    for card in scryfall:
        card2 = {}
        card2['cmc'] = int(card['cmc'])
        if 'mana_cost' in card:
            card2['manaCost'] = card['mana_cost'].replace(
                '{', '').replace('}', '')
        else:
            card2['manaCost'] = ''
        card2['name'] = card['name']
        card2['number'] = card['collector_number']
        # Scryfall says 'mythic'; mtgjson wants 'Mythic Rare' (title-cased).
        card2['rarity'] = card['rarity'].replace(
            'mythic', 'mythic rare').title()
        if 'oracle_text' in card:
            # Normalize em dash and minus sign to plain ASCII hyphens.
            card2['text'] = card['oracle_text'].replace(
                EM_DASH, '-').replace(u"\u2212", "-")
        else:
            card2['text'] = ''
        card2['url'] = card['image_uri']
        if 'type_line' not in card:
            card['type_line'] = 'Unknown'
        card2['type'] = card['type_line'].replace(EM_DASH, '-')
        # Card types: everything before the em dash, minus supertypes.
        cardtypes = card['type_line'].split(EM_DASH)[0].replace('Legendary ', '').replace('Snow ', '')\
            .replace('Elite ', '').replace('Basic ', '').replace('World ', '').replace('Ongoing ', '')
        cardtypes = cardtypes.strip().split(' ')
        if EM_DASH in card['type_line']:
            cardsubtypes = card['type_line'].split(EM_DASH)[1].strip()
            if ' ' in cardsubtypes:
                card2['subtypes'] = cardsubtypes.split(' ')
            else:
                card2['subtypes'] = [cardsubtypes]
        # Collect any supertypes present anywhere in the type line.
        for supertype in ('Legendary', 'Snow', 'Elite', 'Basic', 'World', 'Ongoing'):
            if supertype in card['type_line']:
                card2.setdefault('supertypes', []).append(supertype)
        card2['types'] = cardtypes
        if 'color_identity' in card:
            card2['colorIdentity'] = card['color_identity']
        if card.get('colors'):
            # Expand WUBRG symbols to full color names, in WUBRG order.
            color_names = {'W': 'White', 'U': 'Blue',
                           'B': 'Black', 'R': 'Red', 'G': 'Green'}
            card2['colors'] = []
            for symbol in 'WUBRG':
                if symbol in card['colors']:
                    card2['colors'].append(color_names[symbol])
        if 'all_parts' in card:
            # Related faces/parts (split, meld, tokens) by name.
            card2['names'] = [part['name'] for part in card['all_parts']]
        if 'power' in card:
            card2['power'] = card['power']
        if 'toughness' in card:
            card2['toughness'] = card['toughness']
        if 'layout' in card and card['layout'] != 'normal':
            card2['layout'] = card['layout']
        if 'loyalty' in card:
            card2['loyalty'] = card['loyalty']
        if 'artist' in card:
            card2['artist'] = card['artist']
        if 'flavor_text' in card:
            card2['flavor'] = card['flavor_text']
        if 'multiverse_id' in card:
            card2['multiverseid'] = card['multiverse_id']
        cards2.append(card2)
    return cards2
def smash_mtgs_scryfall(mtgs, scryfall):
    """Debug-compare MTGS card data against Scryfall card data.

    Prints every field that differs between same-named cards, plus cards
    present in only one of the two sources. Purely diagnostic: nothing is
    merged or modified.

    :param mtgs: {"cards": [...]} from the MTGS parser
    :param scryfall: {"cards": [...]} from the Scryfall scraper
    :return: mtgs, unchanged
    """
    for mtgscard in mtgs['cards']:
        cardFound = False
        for scryfallcard in scryfall['cards']:
            if scryfallcard['name'] == mtgscard['name']:
                for key in scryfallcard:
                    if key in mtgscard:
                        if not mtgscard[key] == scryfallcard[key]:
                            try:
                                print "%s's key %s\nMTGS : %s\nScryfall: %s" % (mtgscard['name'], key, mtgscard[key], scryfallcard[key])
                            except:
                                # Unicode printing can fail on some consoles.
                                print "Error printing Scryfall vs MTGS debug info for " + mtgscard['name']
                                pass
                cardFound = True
        if not cardFound:
            print "MTGS has card %s and Scryfall does not." % mtgscard['name']
    # Reverse direction: cards Scryfall has that MTGS lacks.
    for scryfallcard in scryfall['cards']:
        cardFound = False
        for mtgscard in mtgs['cards']:
            if scryfallcard['name'] == mtgscard['name']:
                cardFound = True
        if not cardFound:
            print "Scryfall has card %s and MTGS does not." % scryfallcard['name']
    return mtgs

File diff suppressed because it is too large Load Diff

252
wizards_scraper.py Normal file
View File

@@ -0,0 +1,252 @@
# -*- coding: utf-8 -*-
import requests
from lxml import html
from PIL import Image
def scrape_fullspoil(url="http://magic.wizards.com/en/articles/archive/card-image-gallery/hour-devastation", setinfo={"setname": "HOU"}, showRarityColors=False, showFrameColors=False, manual_cards=[], delete_cards=[], split_cards=[]):
    """Scrape the official WOTC card image gallery for a set.

    Builds the gallery URL from the set's long name when available, collects
    every card image, downloads the images, and infers rarity and mana cost
    from the image pixels.

    :param url: fallback gallery URL when setinfo lacks 'setlongname'
    :param setinfo: set metadata dict; must contain 'setname'
    :param showRarityColors, showFrameColors, manual_cards, delete_cards,
        split_cards: currently unused (accepted for interface compatibility)
    :return: {"cards": [...]} with name/img/url plus inferred rarity/manaCost
    """
    if 'setlongname' in setinfo:
        # NOTE(review): the replace('of', '') / replace(' ', ' ') chain below
        # looks garbled in transit (likely 'of ' removal and double-space
        # collapse in the original) — confirm against the original file.
        url = 'http://magic.wizards.com/en/articles/archive/card-image-gallery/' + setinfo['setlongname'].lower().replace('of', '').replace(
            ' ', ' ').replace(' ', '-')
    page = requests.get(url)
    tree = html.fromstring(page.content)
    cards = []
    cardtree = tree.xpath('//*[@id="content-detail-page-of-an-article"]')
    for child in cardtree:
        cardElements = child.xpath('//*/p/img')
        cardcount = 0
        for cardElement in cardElements:
            # alt text carries the card name; normalize curly apostrophes and
            # keep only the front face of '///' combined names.
            card = {
                "name": cardElement.attrib['alt'].replace(u"\u2019", '\'').split(' /// ')[0],
                "img": cardElement.attrib['src']
            }
            card["url"] = card["img"]
            #card["cmc"] = 0
            #card["manaCost"] = ""
            #card["type"] = ""
            #card["types"] = []
            #card["text"] = ""
            #card["colorIdentity"] = [""]
            # if card['name'] in split_cards:
            #     card["names"] = [card['name'], split_cards[card['name']]]
            #     card["layout"] = "split"
            #notSplit = True
            # for backsplit in split_cards:
            #     if card['name'] == split_cards[backsplit]:
            #         notSplit = False
            # if not card['name'] in delete_cards:
            cards.append(card)
            cardcount += 1
    fullspoil = {"cards": cards}
    # NOTE(review): cardcount is defined inside the loop above; an empty
    # gallery would leave it unbound here.
    print "Spoil Gallery has " + str(cardcount) + " cards."
    # download_images is defined elsewhere in this project (not in this file).
    download_images(fullspoil['cards'], setinfo['setname'])
    fullspoil = get_rarities_by_symbol(fullspoil, setinfo['setname'])
    fullspoil = get_mana_symbols(fullspoil, setinfo['setname'])
    #fullspoil = get_colors_by_frame(fullspoil, setinfo['setname'])
    return fullspoil
def get_rarities_by_symbol(fullspoil, setcode, split_cards=[]):
    """Infer each card's rarity from the set-symbol pixels of its image.

    Crops the set symbol region from the downloaded card image, averages its
    RGB channels, and picks the rarity whose reference color is closest.

    :param fullspoil: {"cards": [...]} whose images exist under images/<setcode>/
    :param setcode: set code used in the image directory path
    :param split_cards: unused (accepted for interface compatibility)
    :return: fullspoil with card['rarity'] filled in where an image existed
    """
    symbolPixels = (240, 219, 242, 221)  # crop box for a normal card's set symbol
    highVariance = 15  # worse than this and the match is considered unreliable
    # Reference mean [R, G, B] for each rarity's symbol color.
    colorAverages = {
        "Common": [30, 27, 28],
        "Uncommon": [121, 155, 169],
        "Rare": [166, 143, 80],
        "Mythic Rare": [201, 85, 14]
    }
    symbolCount = 0
    for card in fullspoil['cards']:
        try:
            cardImage = Image.open(
                'images/' + setcode + '/' + card['name'].replace(' // ', '') + '.jpg')
        except:
            # No downloaded image for this card; skip it.
            continue
            pass
        if '//' in card['name']:
            # Split cards are rotated, so their symbol sits elsewhere.
            setSymbol = cardImage.crop((240, 138, 242, 140))
        else:
            setSymbol = cardImage.crop(symbolPixels)
        # Weighted channel means from the crop's histogram.
        cardHistogram = setSymbol.histogram()
        reds = cardHistogram[0:256]
        greens = cardHistogram[256:256 * 2]
        blues = cardHistogram[256 * 2: 256 * 3]
        reds = sum(i * w for i, w in enumerate(reds)) / sum(reds)
        greens = sum(i * w for i, w in enumerate(greens)) / sum(greens)
        blues = sum(i * w for i, w in enumerate(blues)) / sum(blues)
        variance = 768  # max possible channel distance; any match beats this
        for color in colorAverages:
            colorVariance = 0
            colorVariance = colorVariance + \
                abs(colorAverages[color][0] - reds)
            colorVariance = colorVariance + \
                abs(colorAverages[color][1] - greens)
            colorVariance = colorVariance + \
                abs(colorAverages[color][2] - blues)
            if colorVariance < variance:
                variance = colorVariance
                card['rarity'] = color
        if variance > highVariance:
            # if a card isn't close to any of the colors, it's probably a planeswalker? make it mythic.
            print card['name'], 'has high variance of', variance, ', closest rarity is', card['rarity']
            card['rarity'] = "Mythic Rare"
        # print card['name'], '$', reds, greens, blues
        if symbolCount < 10:
            # Save the first few crops for manual spot-checking.
            setSymbol.save(
                'images/' + card['name'].replace(' // ', '') + '.symbol.jpg')
            symbolCount += 1
    return fullspoil
def get_colors_by_frame(fullspoil, setcode, split_cards={}):
    """Infer each card's color(s) from the frame pixels of its image.

    Same technique as get_rarities_by_symbol: crop a strip of the card frame,
    average the RGB channels, and pick the closest reference frame color.
    Currently unused by scrape_fullspoil (call site is commented out).

    :param fullspoil: {"cards": [...]} whose images exist under images/<setcode>/
    :param setcode: set code used in the image directory path
    :param split_cards: unused (accepted for interface compatibility)
    :return: fullspoil with card['colors'] set to a one-element list
    """
    framePixels = (20, 11, 76, 16)  # crop box along the card's top frame
    highVariance = 10  # NOTE(review): defined but never checked below
    # Reference mean [R, G, B] for each frame color.
    colorAverages = {
        "White": [231, 225, 200],
        "Blue": [103, 193, 230],
        "Black": [58, 61, 54],
        "Red": [221, 122, 101],
        "Green": [118, 165, 131],
        "Multicolor": [219, 200, 138],
        "Artifact": [141, 165, 173],
        "Colorless": [216, 197, 176],
    }
    symbolCount = 0
    for card in fullspoil['cards']:
        try:
            cardImage = Image.open(
                'images/' + setcode + '/' + card['name'].replace(' // ', '') + '.jpg')
        except:
            # No downloaded image for this card; skip it.
            continue
            pass
        cardColor = cardImage.crop(framePixels)
        # Weighted channel means from the crop's histogram.
        cardHistogram = cardColor.histogram()
        reds = cardHistogram[0:256]
        greens = cardHistogram[256:256 * 2]
        blues = cardHistogram[256 * 2: 256 * 3]
        reds = sum(i * w for i, w in enumerate(reds)) / sum(reds)
        greens = sum(i * w for i, w in enumerate(greens)) / sum(greens)
        blues = sum(i * w for i, w in enumerate(blues)) / sum(blues)
        variance = 768
        for color in colorAverages:
            colorVariance = 0
            colorVariance = colorVariance + \
                abs(colorAverages[color][0] - reds)
            colorVariance = colorVariance + \
                abs(colorAverages[color][1] - greens)
            colorVariance = colorVariance + \
                abs(colorAverages[color][2] - blues)
            if colorVariance < variance:
                variance = colorVariance
                card['colors'] = [color]
    return fullspoil
def get_mana_symbols(fullspoil={}, setcode="HOU", split_cards=[]):
    """Infer each card's mana cost by color-matching its mana symbol pixels.

    Scans up to five fixed crop boxes along the top-right of the card image,
    right to left, matching each against reference symbol colors. Symbols
    '2' and '5' look alike, so a second, tighter crop disambiguates them.

    :param fullspoil: {"cards": [...]} whose images exist under images/<setcode>/
    :param setcode: set code used in the image directory path
    :param split_cards: unused (accepted for interface compatibility)
    :return: fullspoil with card['manaCost'] built from recognized symbols
    """
    # Crop boxes for the five rightmost mana symbol positions, right to left.
    manaBoxes = [(234, 23, 244, 33), (220, 23, 230, 33),
                 (206, 23, 216, 33), (192, 23, 202, 33), (178, 23, 188, 33)]
    highVariance = 0  # NOTE(review): defined but never used below
    # Reference mean [R, G, B] for each recognizable mana symbol.
    colorAverages = {
        "W": [126, 123, 110],
        "U": [115, 140, 151],
        "B": [105, 99, 98],
        "R": [120, 89, 77],
        "G": [65, 78, 69],
        "1": [162, 156, 154],
        "2": [155, 148, 147],
        "3": [160, 153, 152],
        "4": [149, 143, 141],
        "5": [155, 149, 147],
        "6": [151, 145, 143],
        "7": [169, 163, 161],
        "X": [160, 154, 152]
    }
    for card in fullspoil['cards']:
        try:
            cardImage = Image.open(
                'images/' + setcode + '/' + card['name'].replace(' // ', '') + '.jpg')
        except:
            # No downloaded image for this card; skip it.
            continue
            pass
        card['manaCost'] = ""
        for manaBox in manaBoxes:
            # Weighted channel means for this symbol position.
            manaSymbol = cardImage.crop(manaBox)
            cardHistogram = manaSymbol.histogram()
            reds = cardHistogram[0:256]
            greens = cardHistogram[256:256 * 2]
            blues = cardHistogram[256 * 2: 256 * 3]
            reds = sum(i * w for i, w in enumerate(reds)) / sum(reds)
            greens = sum(i * w for i, w in enumerate(greens)) / sum(greens)
            blues = sum(i * w for i, w in enumerate(blues)) / sum(blues)
            variance = 768
            for color in colorAverages:
                colorVariance = 0
                colorVariance = colorVariance + \
                    abs(colorAverages[color][0] - reds)
                colorVariance = colorVariance + \
                    abs(colorAverages[color][1] - greens)
                colorVariance = colorVariance + \
                    abs(colorAverages[color][2] - blues)
                if colorVariance < variance:
                    variance = colorVariance
                    closestColor = color
            if variance < 10:
                # if card['name'] in ["Mirage Mirror", "Uncage the Menagerie", "Torment of Hailfire"]:
                #     print card['name'] + " " + str(reds) + " " + str(greens) + " " + str(blues)
                if closestColor in ["2", "5"]:
                    # '2' and '5' symbols are nearly identical; re-sample a
                    # tighter interior crop to tell them apart.
                    twoVSfive = (
                        manaBox[0] + 1, manaBox[1] + 4, manaBox[2] - 5, manaBox[3] - 2)
                    manaSymbol = cardImage.crop(twoVSfive)
                    cardHistogram = manaSymbol.histogram()
                    reds = cardHistogram[0:256]
                    greens = cardHistogram[256:256 * 2]
                    blues = cardHistogram[256 * 2: 256 * 3]
                    reds = sum(
                        i * w for i, w in enumerate(reds)) / sum(reds)
                    greens = sum(
                        i * w for i, w in enumerate(greens)) / sum(greens)
                    blues = sum(
                        i * w for i, w in enumerate(blues)) / sum(blues)
                    variance = 768
                    colorVariance = 0
                    colorVariance = colorVariance + abs(175 - reds)
                    colorVariance = colorVariance + abs(168 - greens)
                    colorVariance = colorVariance + abs(166 - blues)
                    if colorVariance < 10:
                        closestColor = "2"
                    elif colorVariance > 110 and colorVariance < 120:
                        closestColor = "5"
            else:
                # No confident symbol match at this position.
                continue
            # Boxes are scanned right to left, so prepend to build the cost.
            card['manaCost'] = closestColor + card['manaCost']
    return fullspoil
def smash_fullspoil(mtgjson, fullspoil):
    """Debug-compare mtgjson card data against the WOTC gallery scrape.

    Prints fields that differ between same-named cards (except 'colors'),
    and cards that appear only in the WOTC gallery. Diagnostic only; nothing
    is merged or modified.

    :param mtgjson: {"cards": [...]} built from the other scrapers
    :param fullspoil: {"cards": [...]} from scrape_fullspoil
    :return: None (results are printed)
    """
    different_keys = {}
    for mtgjson_card in mtgjson['cards']:
        for fullspoil_card in fullspoil['cards']:
            if mtgjson_card['name'] == fullspoil_card['name']:
                for key in fullspoil_card:
                    if key in mtgjson_card:
                        if mtgjson_card[key] != fullspoil_card[key] and key != 'colors':
                            if not fullspoil_card['name'] in different_keys:
                                different_keys[fullspoil_card['name']] = {
                                    key: fullspoil_card[key]}
                            else:
                                different_keys[fullspoil_card['name']
                                               ][key] = fullspoil_card[key]
    for fullspoil_card in fullspoil['cards']:
        # NOTE(review): WOTC_only is re-initialized on every iteration, so at
        # most one name is ever reported per print — looks like the init was
        # meant to sit before the loop; confirm intent.
        WOTC_only = []
        match = False
        for mtgjson_card in mtgjson['cards']:
            if mtgjson_card['name'] == fullspoil_card['name']:
                match = True
        if not match:
            WOTC_only.append(fullspoil_card['name'])
        if len(WOTC_only) > 0:
            print "WOTC only cards: "
            print WOTC_only
    print different_keys