Merge pull request #9 from tritoch/mythic-scraper

Mythic scraper
This commit is contained in:
tritoch 2017-06-16 21:35:03 -05:00 committed by GitHub
commit 52b0b42cb3
2 changed files with 82 additions and 0 deletions

View File

@ -3,3 +3,4 @@ feedparser
lxml
Pillow
datetime
beautifulsoup4

View File

@ -12,6 +12,9 @@ import datetime
import urllib
import json
import xml.dom.minidom
from bs4 import BeautifulSoup as BS
from bs4 import Comment
def scrape_mtgs(url):
    """Fetch ``url`` with cache-defeating request headers and return the body text."""
    no_cache_headers = {
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Expires': 'Thu, 01 Jan 1970 00:00:00 GMT',
    }
    response = requests.get(url, headers=no_cache_headers)
    return response.text
@ -1146,3 +1149,81 @@ def add_headers(mtgjson, setinfos):
if 'blockname' in setinfos:
mtgjson2['block'] = setinfos['blockname']
return mtgjson2
def get_mythic_cards(url='http://mythicspoiler.com/ixa/', mtgjson=False):
    """Scrape card data from the card pages linked on a set page.

    The set page at ``url`` is fetched, every ``<a class="card">`` link on it
    is turned into a card-page URL, and each card page is parsed with
    ``scrape_mythic_card_page``.

    Args:
        url: Set page URL; each link's href is appended to it to form the
            card-page URL.
        mtgjson: Optional dict with a ``'cards'`` list. When given (and
            truthy), pages whose URL contains the lower-cased, space-free
            name of a card already in that list are not scraped, and newly
            scraped cards are appended to ``mtgjson['cards']`` in place.

    Returns:
        ``{'cards': [...]}`` holding the scraped cards when ``mtgjson`` is
        falsy; otherwise the same ``mtgjson`` object that was passed in.
    """
    r = requests.get(url)
    soup = BS(r.text, "html.parser")

    urllist = []
    for cardurl in soup.find_all('a', 'card'):
        try:
            urllist.append(url + str(cardurl).split("href=\"")[1].split('"><img')[0])
        except IndexError:
            # The anchor's markup has no href="..." to cut out; skip it.
            continue

    if not mtgjson:
        cards = {'cards': []}
        for card_url in urllist:
            # Scrape once and reuse the result instead of requesting the
            # same page a second time just to append it.
            card = scrape_mythic_card_page(card_url)
            if card != '' and 'name' in card and card['name'] != '':
                cards['cards'].append(card)
            time.sleep(.5)  # pause between card-page requests
        return cards

    for card_url in urllist:
        already_known = any(
            known['name'].lower().replace(' ', '') in card_url
            for known in mtgjson['cards']
        )
        if already_known:
            continue
        card = scrape_mythic_card_page(card_url)
        if card != '' and 'name' in card and card['name'] != '':
            mtgjson['cards'].append(card)
    return mtgjson
def scrape_mythic_card_page(url):
    """Parse a single card page into a dict of card fields.

    The page is scanned for HTML comments used as field markers
    (``CARD NAME``, ``MANA COST``, ``TYPE``, ``CARD TEXT``, ``Set Number``,
    ``P/T``); the node that follows each marker supplies the value.

    Args:
        url: URL of the card page to fetch.

    Returns:
        A dict that may contain ``name``, ``manaCost``, ``type``, ``text``,
        ``number``, ``power`` and ``toughness``. A field is omitted when its
        marker is absent or is not followed by plain text, so callers should
        check for the keys they need. ``text`` is only set when a
        ``FLAVOR TEXT`` marker terminates the card-text section.
    """
    r = requests.get(url)
    soup = BS(r.text, "html.parser")
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    card = {}
    for comment in comments:
        if comment == 'CARD TEXT':
            # Collect every non-empty text node up to the FLAVOR TEXT marker.
            lines = []
            for element in comment.next_elements:
                text = _mythic_text(element)
                if text is None:
                    continue
                if text == 'FLAVOR TEXT':
                    card['text'] = '\n'.join(lines)
                    break
                if text not in ('CARD TEXT', ''):
                    lines.append(text)
            continue

        value = _mythic_text(comment.next_element)
        if value is None:
            # Marker is not followed by plain text; leave the field unset
            # rather than aborting the whole page.
            continue
        if comment == 'CARD NAME':
            card['name'] = value.replace('"', '')
        elif comment == 'MANA COST':
            card['manaCost'] = value.replace('"', '')
        elif comment == 'TYPE':
            card['type'] = value.replace('"', '')
        elif comment == 'Set Number':
            card['number'] = value
        elif comment == 'P/T':
            parts = value.split('/')
            if parts[0] != '':
                card['power'] = parts[0]
                if len(parts) > 1:
                    card['toughness'] = parts[1]
    return card


def _mythic_text(node):
    """Return ``node.strip()``, or None when ``node`` cannot be stripped as text."""
    try:
        return node.strip()
    except (AttributeError, TypeError):
        # Non-text nodes (presumably tags, or None past the last node) have
        # no usable ``strip``; verify against the bs4 version in use.
        return None