Scrape MTGS for images. Fix WOTC URL (#36)

* Scrape MTGS for images

* Fix WOTC card gallery URL.

* Additional Aftermath Fixes
This commit is contained in:
tritoch 2017-06-21 12:44:39 -05:00 committed by GitHub
parent ab75dd2104
commit 6d21eb5ca8
3 changed files with 51 additions and 11 deletions

View File

@ -101,7 +101,7 @@ if __name__ == '__main__':
[mtgs, split_cards] = spoilers.parse_mtgs(mtgs, [], [], [], presets['split_cards']) #parse spoilers into mtgjson format
mtgs = spoilers.correct_cards(mtgs, manual_sets[setinfo['setname']]['cards'], card_corrections, delete_cards) #fix using the fixfiles
scryfall = spoilers.get_scryfall('https://api.scryfall.com/cards/search?q=++e:' + setinfo['setname'].lower())
mtgs = spoilers.get_image_urls(mtgs, presets['isfullspoil'], setinfo['setname'], setinfo['setlongname'], setinfo['setsize']) #get images
mtgs = spoilers.get_image_urls(mtgs, presets['isfullspoil'], setinfo['setname'], setinfo['setlongname'], setinfo['setsize'], setinfo) #get images
mtgjson = spoilers.smash_mtgs_scryfall(mtgs, scryfall)
[mtgjson, errors] = spoilers.error_check(mtgjson, card_corrections) #check for errors where possible
errorlog += errors

View File

@ -5,12 +5,14 @@
"setsize": 199,
"setreleasedate": "2017-07-14",
"settype": "expansion",
"mtgsurl": "http://www.mtgsalvation.com/spoilers/183-hour-of-devastation",
"mtgscardpath": "http://www.mtgsalvation.com/cards/hour-of-devastation/",
"masterpieces": {
"setname": "MPS_AKH",
"setlongname": "Masterpiece Series: Amonkhet Invocations",
"setreleasedate": "2017-04-28",
"alternativeNames": ["Amonkhet Invocations"],
"galleryURL": "http://magic.wizards.com/en/articles/archive/card-preview/masterpiece-series-amonkhet-invocations-2017-03-29",
"galleryURL": "http://magic.wizards.com/en/articles/archive/feature/masterpiece-series-hour-devastation-invocations-2017-06-19",
"additionalCardNames": [],
"mtgsurl": "http://www.mtgsalvation.com/spoilers/181-amonkhet-invocations",
"mtgscardpath": "http://www.mtgsalvation.com/cards/amonkhet-invocations/"

View File

@ -54,15 +54,19 @@ def parse_mtgs(mtgs, manual_cards=[], card_corrections=[], delete_cards=[], spli
if 'rules' in card:
htmltags = re.compile(r'<.*?>')
card['rules'] = htmltags.sub('', card['rules'])
if '//' in card['name']:
if '//' in card['name'] or 'Aftermath' in card['rules']:
print 'Splitting up Aftermath card ' + card['name']
card['name'] = card['name'].replace(' // ','//')
card1 = card.copy()
card1['name'] = card['name'].split('//')[0]
card1['rules'] = card['rules'].split('\n\n')[0]
card2 = dict(cost='',cmc='',img='',pow='',name='',rules='',type='',
color='', altname='', colorIdentity='', colorArray=[], colorIdentityArray=[], setnumber='', rarity='')
card2["name"] = card['name'].split('//')[1]
if '//' in card['name']:
card['name'] = card['name'].replace(' // ','//')
card1['name'] = card['name'].split('//')[0]
card2["name"] = card['name'].split('//')[1]
else:
card1['name'] = card['name']
card2["name"] = card['rules'].split('\n\n')[1].strip().split(' {')[0]
card1['rules'] = card['rules'].split('\n\n')[0].strip()
card2["rules"] = "Aftermath" + card['rules'].split('Aftermath')[1]
card2['cost'] = re.findall(r'{.*}',card['rules'])[0].replace('{','').replace('}','').upper()
card2['type'] = re.findall(r'}\n.*\n', card['rules'])[0].replace('}','').replace('\n','')
@ -739,16 +743,17 @@ def get_colors_by_frame(fullspoil, split_cards=[]):
# symbolCount += 1
return fullspoil
def get_image_urls(mtgjson, isfullspoil, setname, setlongname, setSize=269):
def get_image_urls(mtgjson, isfullspoil, setname, setlongname, setSize=269, setinfo=False):
IMAGES = 'http://magic.wizards.com/en/content/' + setlongname.lower().replace(' ', '-') + '-cards'
IMAGES2 = 'http://mythicspoiler.com/newspoilers.html'
IMAGES3 = 'http://magic.wizards.com/en/articles/archive/card-image-gallery/' + setlongname.lower().replace(' ', '-')
IMAGES3 = 'http://magic.wizards.com/en/articles/archive/card-image-gallery/' + setlongname.lower().replace('of','').replace(' ',' ').replace(' ', '-')
text = requests.get(IMAGES).text
text2 = requests.get(IMAGES2).text
text3 = requests.get(IMAGES3).text
wotcpattern = r'<img alt="{}.*?" src="(?P<img>.*?\.png)"'
mythicspoilerpattern = r' src="' + setname.lower() + '/cards/{}.*?.jpg">'
WOTC = []
for c in mtgjson['cards']:
match = re.search(wotcpattern.format(c['name'].replace('\'','&rsquo;')), text, re.DOTALL)
if match:
@ -768,10 +773,43 @@ def get_image_urls(mtgjson, isfullspoil, setname, setlongname, setSize=269):
pass
#if ('Creature' in c['type'] and not c.has_key('power')) or ('Vehicle' in c['type'] and not c.has_key('power')):
# print(c['name'] + ' is a creature w/o p/t img: ' + c['url'])
if len(str(c['url'])) < 10:
print(c['name'] + ' has no image.')
if 'wizards.com' in c['url']:
WOTC.append(c['name'])
if setinfo:
if 'mtgsurl' in setinfo and 'mtgscardpath' in setinfo:
mtgsImages = scrape_mtgs_images(setinfo['mtgsurl'], setinfo['mtgscardpath'], WOTC)
for card in mtgjson['cards']:
if card['name'] in mtgsImages:
card['url'] = mtgsImages[card['name']]['url']
for card in mtgjson['cards']:
if len(str(card['url'])) < 10:
print(card['name'] + ' has no image.')
return mtgjson
def scrape_mtgs_images(url='http://www.mtgsalvation.com/spoilers/183-hour-of-devastation', mtgscardurl='http://www.mtgsalvation.com/cards/hour-of-devastation/', exemptlist=None):
    """Scrape card image URLs from an MTG Salvation spoiler listing.

    Fetches the spoiler page at ``url``, finds every element whose class
    contains ``log-card``, then requests each card's own page (built from
    ``mtgscardurl``, the element's ``data-card-id`` attribute and a slug of
    its text) and reads the ``src`` of the first ``card-spoiler-image`` img.

    Args:
        url: Address of the spoiler listing page.
        mtgscardurl: Base address that individual card pages hang off.
        exemptlist: Card names to skip entirely (no request is made for
            them). ``None`` means no exemptions.

    Returns:
        A dict mapping each scraped element's text to ``{"url": <src>}``.
        The url is an empty string when the card page has no matching image
        or the image has no ``src`` attribute.
    """
    exempt = [] if exemptlist is None else exemptlist
    listing = html.fromstring(requests.get(url).content)
    cards = {}
    for child in listing.xpath('//*[contains(@class, "log-card")]'):
        name = child.text
        card_id = child.attrib.get('data-card-id')
        # Without text or a card id no card page address can be built;
        # skip the element rather than abort the whole scrape.
        if name is None or card_id is None:
            continue
        # NOTE(review): 'Reason' is presumably a non-card entry that shares
        # the log-card class on the listing page -- confirm against the site.
        if name == 'Reason' or name in exempt:
            continue
        # Slug: spaces become dashes, apostrophes and commas are dropped, and
        # the '-//' left over from a split name ("A // B") is removed.
        slug = name.replace(' ', '-').replace("'", "").replace(',', '').replace('-//', '')
        cardpage = requests.get(mtgscardurl + card_id + '-' + slug)
        images = html.fromstring(cardpage.content).xpath('//img[contains(@class, "card-spoiler-image")]')
        # Best effort: a missing image yields an empty url instead of an error.
        cardurl = images[0].attrib.get('src', '') if images else ''
        cards[name] = {
            "url": cardurl
        }
        # Pause between card-page requests.
        time.sleep(.2)
    return cards
def write_xml(mtgjson, setname, setlongname, setreleasedate, split_cards=[]):
if not os.path.isdir('out/'):
os.makedirs('out/')