03-10-14, 01:42 AM   #1
xxauroraxx
A Murloc Raider
Join Date: Mar 2014
Posts: 5
Database crawler

Thought this would be helpful in case anyone wants to create a local repository of items

Edit: "Parser" would be better terminology...

This script requests the item page on either Blizzard's or Wowhead's site for every item ID between start and finish, extracts each item's name and attributes, and stores everything in a pickled dictionary (item name mapped to a list of attribute strings).

Python 2.7
Code:
import asyncore
import string, socket
import StringIO
import mimetools, urlparse
import pickle
import re

start=1000
finish=100000
blizzard = "http://us.battle.net/wow/en/item/"
wowhead = "http://www.wowhead.com/item="
url=blizzard

class AsyncHTTP(asyncore.dispatcher_with_send):
    # HTTP requestor

    def __init__(self, uri, consumer):
        asyncore.dispatcher_with_send.__init__(self)

        self.uri = uri
        self.consumer = consumer

        # turn the uri into a valid request
        scheme, host, path, params, query, fragment = urlparse.urlparse(uri)
        assert scheme == "http", "only supports HTTP requests"
        try:
            host, port = string.split(host, ":", 1)
            port = int(port)
        except (TypeError, ValueError):
            port = 80 # default port
        if not path:
            path = "/"
        if params:
            path = path + ";" + params
        if query:
            path = path + "?" + query

        self.request = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (path, host)

        self.host = host
        self.port = port

        self.status = None
        self.header = None

        self.data = ""

        # get things going!
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        self.connect((host, port))

    def handle_connect(self):
        # connection succeeded
        self.send(self.request)

    def handle_expt(self):
        # connection failed; notify consumer (status is None)
        self.close()
        try:
            http_header = self.consumer.http_header
        except AttributeError:
            pass
        else:
            http_header(self)

    def handle_read(self):
        data = self.recv(2048)
        if not self.header:
            self.data = self.data + data
            try:
                i = string.index(self.data, "\r\n\r\n")
            except ValueError:
                return # continue
            else:
                # parse header
                fp = StringIO.StringIO(self.data[:i+4])
                # status line is "HTTP/version status message"
                status = fp.readline()
                self.status = string.split(status, " ", 2)
                # followed by a rfc822-style message header
                self.header = mimetools.Message(fp)
                # followed by a newline, and the payload (if any)
                data = self.data[i+4:]
                self.data = ""
                # notify consumer (status is non-zero)
                try:
                    http_header = self.consumer.http_header
                except AttributeError:
                    pass
                else:
                    http_header(self)
                if not self.connected:
                    return # channel was closed by consumer

        self.consumer.feed(data)

    def handle_close(self):
        #self.consumer.close()
        self.close()

class DummyConsumer:
    size = 0
    text = ''

    def http_header(self, request):
        # handle header
        if request.status is None:
            print "connection failed"

    def feed(self, data):
        # handle incoming data
        self.size = self.size + len(data)
        self.text = self.text + data

    #def close(self):
        # end of data
        #print self.size, "bytes in body"
        #print self.text

#
# try it out

itemDB = {}  # accumulate items across the whole run; pickled once at the end
itemCounter = start
while itemCounter < finish:
    consumer = DummyConsumer()
    consumer.text = ''
    request = AsyncHTTP(
        url + str(itemCounter),
        consumer
        )

    asyncore.loop()
    print url + str(itemCounter)
    log = open('log.txt', 'a')

    x = consumer.text
    if '<b class="q' in x:
        print 'FOUND AN ITEM'
        name = x.split('<b class="q')
        x = x.replace(name[0], '')
        # drop the leading quality digit plus '">' (3 chars), keep up to </b>
        name = name[1][3:].split('</b>')[0]
        itemDB[name] = []
        x = x.replace(name, '')
        x = x.split("ge('icon")[0]
        x = x.rstrip(' \t\n\r')
        results = re.compile('>(.*?)<', re.DOTALL | re.IGNORECASE).findall(x)
        for y in results:
            if len(y) > 1 and '\n' not in y:
                itemDB[name].append(y)
        print 'Adding %s : item %s with attributes:'%(name, itemCounter)
        log.write('Adding %s : item %s with attributes:'%(name, itemCounter))
        for x in itemDB[name]:
            print ' ' + x
            log.write(' ' + x)
        print '\n'
        log.write('\n')
        
    log.write(url + str(itemCounter) + '\n')
    log.close()
    itemCounter = itemCounter + 1

str_path = open('itemdatabase.db', 'wb')
pickle.dump(itemDB, str_path)
str_path.close()
print "Complete and written to 'itemdatabase.db'!"

Last edited by xxauroraxx : 03-10-14 at 01:47 AM.
03-14-14, 08:45 AM   #2
Choonstertwo
A Chromatic Dragonspawn
 
Join Date: Jan 2011
Posts: 194
Interesting. I downloaded the Windows build of Python 2.7.6 to try this out, but it wasn't working for me. After adding a few print statements to the main loop, I determined that consumer.text is always an empty string. Am I doing something wrong?

This is the version I'm using:
Code:
import asyncore
import string, socket
import StringIO
import mimetools, urlparse
import pickle
import re

start=13359
finish=13370
blizzard = "http://us.battle.net/wow/en/item/"
wowhead = "http://www.wowhead.com/item="
url=wowhead
outputdir = "G:/Documents/MiscScripts/WoWItemScraper/"

class AsyncHTTP(asyncore.dispatcher_with_send):
    # HTTP requestor

    def __init__(self, uri, consumer):
        asyncore.dispatcher_with_send.__init__(self)

        self.uri = uri
        self.consumer = consumer

        # turn the uri into a valid request
        scheme, host, path, params, query, fragment = urlparse.urlparse(uri)
        assert scheme == "http", "only supports HTTP requests"
        try:
            host, port = string.split(host, ":", 1)
            port = int(port)
        except (TypeError, ValueError):
            port = 80 # default port
        if not path:
            path = "/"
        if params:
            path = path + ";" + params
        if query:
            path = path + "?" + query

        self.request = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (path, host)

        self.host = host
        self.port = port

        self.status = None
        self.header = None

        self.data = ""

        # get things going!
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        self.connect((host, port))

    def handle_connect(self):
        # connection succeeded
        self.send(self.request)

    def handle_expt(self):
        # connection failed; notify consumer (status is None)
        self.close()
        try:
            http_header = self.consumer.http_header
        except AttributeError:
            pass
        else:
            http_header(self)

    def handle_read(self):
        data = self.recv(2048)
        if not self.header:
            self.data = self.data + data
            try:
                i = string.index(self.data, "\r\n\r\n")
            except ValueError:
                return # continue
            else:
                # parse header
                fp = StringIO.StringIO(self.data[:i+4])
                # status line is "HTTP/version status message"
                status = fp.readline()
                self.status = string.split(status, " ", 2)
                # followed by a rfc822-style message header
                self.header = mimetools.Message(fp)
                # followed by a newline, and the payload (if any)
                data = self.data[i+4:]
                self.data = ""
                # notify consumer (status is non-zero)
                try:
                    http_header = self.consumer.http_header
                except AttributeError:
                    pass
                else:
                    http_header(self)
                if not self.connected:
                    return # channel was closed by consumer

        self.consumer.feed(data)

    def handle_close(self):
        #self.consumer.close()
        self.close()

class DummyConsumer:
    size = 0
    text = ''

    def http_header(self, request):
        # handle header
        if request.status is None:
            print "connection failed"

    def feed(self, data):
        # handle incoming data
        self.size = self.size + len(data)
        self.text = self.text + data

    #def close(self):
        # end of data
        #print self.size, "bytes in body"
        #print self.text

#
# try it out

itemDB = {}  # accumulate items across the whole run; pickled once at the end
itemCounter = start
while itemCounter < finish:
    consumer = DummyConsumer()
    consumer.text = ''

    request = AsyncHTTP(
        url + str(itemCounter),
        consumer
        )

    asyncore.loop()
    print url + str(itemCounter)
    log = open(outputdir + 'log.txt', 'a')

    x = consumer.text
    print "Result: %d length, %s" % (len(x), x[0:10])
	
    if '<b class="q' in x:
        print 'FOUND AN ITEM'
        name = x.split('<b class="q')
        x = x.replace(name[0], '')
        # drop the leading quality digit plus '">' (3 chars), keep up to </b>
        name = name[1][3:].split('</b>')[0]
        itemDB[name] = []
        x = x.replace(name, '')
        x = x.split("ge('icon")[0]
        x = x.rstrip(' \t\n\r')
        results = re.compile('>(.*?)<', re.DOTALL | re.IGNORECASE).findall(x)
        for y in results:
            if len(y) > 1 and '\n' not in y:
                itemDB[name].append(y)
        print 'Adding %s : item %s with attributes:'%(name, itemCounter)
        log.write('Adding %s : item %s with attributes:'%(name, itemCounter))
        for x in itemDB[name]:
            print ' ' + x
            log.write(' ' + x)
        print '\n'
        log.write('\n')
        
    log.write(url + str(itemCounter) + '\n')
    log.close()
    itemCounter = itemCounter + 1

str_path = open(outputdir + 'itemdatabase.db', 'wb')
pickle.dump(itemDB, str_path)
str_path.close()
print "Complete and written to '%sitemdatabase.db'!" % outputdir
All I've done to it is add an output directory for the log and DB, plus the print "Result: %d length, %s" % (len(x), x[0:10]) statement.

On a side note, since Battle.net uses a completely different HTML layout for its item pages than Wowhead, the script won't recognise the start of an item box in Battle.net item pages (Battle.net uses <h2 class="color-q*"> for its item names instead of <b class="q*"> [where * is a numeric quality index]).
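
If anyone wants to handle both layouts, a single regex could cover them. A rough sketch (the find_item_name helper is hypothetical, and the exact Battle.net markup may differ beyond the class name):

Code:
import re

# Assumed markup shapes; only the class names come from this thread:
#   Wowhead:    <b class="q4">Item Name</b>
#   Battle.net: <h2 class="color-q4">Item Name</h2>
ITEM_NAME = re.compile(r'<(b|h2) class="(?:color-)?q\d+">(.*?)</\1>',
                       re.DOTALL | re.IGNORECASE)

def find_item_name(html):
    match = ITEM_NAME.search(html)
    if match:
        return match.group(2)
    return None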

Finally, is there any particular reason you're scraping web pages instead of using the official item data web APIs provided by Blizzard and Wowhead?
03-14-14, 09:13 AM   #3
Duugu
Premium Member
 
Join Date: Nov 2006
Posts: 851
I'm just curious ... wouldn't an ingame query via GetItemInfo/scanning the tooltip deliver the same data?
03-14-14, 09:18 AM   #4
xxauroraxx
A Murloc Raider
Join Date: Mar 2014
Posts: 5
Originally Posted by Duugu
I'm just curious ... wouldn't an ingame query via GetItemInfo/scanning the tooltip deliver the same data?
Actually yes. I didn't see the functions for this at the time, but that would do it.
03-14-14, 09:21 AM   #5
xxauroraxx
A Murloc Raider
Join Date: Mar 2014
Posts: 5
Originally Posted by Choonstertwo
Interesting. I downloaded the Windows build of Python 2.7.6 to try this out, but it wasn't working for me. After adding a few print statements to the main loop, I determined that consumer.text is always an empty string. Am I doing something wrong?

This is the version I'm using: [code snipped; see post #2]

On a side note, since Battle.net uses a completely different HTML layout for its item pages than Wowhead, the script won't recognise the start of an item box in Battle.net item pages (Battle.net uses <h2 class="color-q*"> for its item names instead of <b class="q*"> [where * is a numeric quality index]).

Finally, is there any particular reason you're scraping web pages instead of using the official item data web APIs provided by Blizzard and Wowhead?
First, yeah, I forgot the item formatting is quite different between the two sites.

And yes: I don't play on retail WoW, so the data the official APIs return wouldn't match my client's version.

I couldn't say why consumer.text is empty. I had some trouble getting it to work over the loop as well. I ended up testing the asynchronous connection against another page to make sure I could pull data at all, then rewrote the logic of when it pulls. I never pinned down the real issue, but I believe it was because I originally created the consumer object outside of the while loop.

Last edited by xxauroraxx : 03-14-14 at 09:27 AM.
03-14-14, 09:50 AM   #6
xxauroraxx
A Murloc Raider
Join Date: Mar 2014
Posts: 5
I just did a few tests, and it seems as if Wowhead is either disconnecting while trying to return data or returning nothing at all. I did post this script on the Wowhead site, so I wouldn't put it past the admins to have disallowed this kind of crawling to prevent DoS, etc.

Blizzard's site works fine, and the DB I built for my version of WoW works fine.
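
If it's filtering on the request itself, one suspect is the missing User-Agent header; our raw request only sends Host. An untested guess at a fix, swapped into AsyncHTTP.__init__:

Code:
# Untested guess: some sites drop requests that carry no User-Agent header.
self.request = ("GET %s HTTP/1.0\r\n"
                "Host: %s\r\n"
                "User-Agent: Mozilla/5.0 (compatible; ItemParser/0.1)\r\n"
                "\r\n") % (path, host)
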
03-14-14, 11:49 AM   #7
Vlad
A Molten Giant
 
Join Date: Dec 2005
Posts: 793
Note that the game's DBC files actually already contain a lot of item data.

Depending on the type of crawling you need to do, you can often get enough data just by reading the files Blizzard uses for the client's internal database. Just saying!
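
For example, each DBC file starts with a small fixed header. A minimal sketch of reading it in Python (assuming the well-known WDBC layout; the path is just an example):

Code:
import struct

def read_dbc_header(path):
    # WDBC header: 4-byte magic, then four little-endian uint32s:
    # record count, field count, record size, string block size
    with open(path, 'rb') as f:
        magic = f.read(4)
        assert magic == 'WDBC', 'not a DBC file'
        records, fields, record_size, string_size = struct.unpack('<4I', f.read(16))
    return records, fields, record_size, string_size

print read_dbc_header('Item.dbc')
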
03-14-14, 04:27 PM   #8
Phanx
Cat.
 
Join Date: Mar 2006
Posts: 5,617
Originally Posted by xxauroraxx
And yes. I don't play on retail WoW, so the data I would be scraping wouldn't match per version.
You should probably not mention that kind of thing, as any discussion of private servers is forbidden on this site, and is likely to get your thread locked.
03-14-14, 05:23 PM   #9
Sharparam
A Flamescale Wyrmkin
 
Join Date: Oct 2011
Posts: 102
Also, there is a RESTful API to query item data on battle.net: http://blizzard.github.io/api-wow-docs/

Example for the Castlebreaker Bracers: http://eu.battle.net/api/wow/item/103759
(Replace eu with us for US server data)

Edit: Similarly, Wowhead has its own thing as well; appending &xml to any Wowhead URL will give you the data in XML, without all the page markup: http://wowhead.com/item=103759&xml
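
For anyone following along in Python, a minimal sketch (Python 2.7; the 'name' and 'itemLevel' fields are assumptions based on the docs linked above) of pulling one item as JSON instead of scraping HTML:

Code:
import json
import urllib2

def fetch_item(item_id, region='eu'):
    # hits the Battle.net community API endpoint shown above
    url = 'http://%s.battle.net/api/wow/item/%d' % (region, item_id)
    try:
        return json.load(urllib2.urlopen(url))
    except urllib2.HTTPError:
        return None  # unused item IDs come back as 404

item = fetch_item(103759)
if item is not None:
    print item['name'], '- item level', item['itemLevel']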

Last edited by Sharparam : 03-14-14 at 05:43 PM.
03-16-14, 12:49 AM   #10
JDoubleU00
A Firelord
 
Join Date: Mar 2008
Posts: 463
My question is for self-education. I tried to Google for code to read and display a pickled DB, but that left me with few examples I could tweak. Would you mind posting some code that reads the DB file you create? I realize there are better ways to do this, but I am dabbling in Python.
03-16-14, 05:39 AM   #11
xxauroraxx
A Murloc Raider
Join Date: Mar 2014
Posts: 5
This will look for all items with a specific slot and stat (such as intellect or increased spell damage).

Code:
import pickle

myFile = open('itemdatabase.db', 'rb')  # 'rb' to match the 'wb' used for the dump
myDB = pickle.load(myFile)
myFile.close()

myDir = "D:/_itemsearches/"

while True:

    myItems = []
    
    count = 0
    slot = raw_input('Enter slot to look for: ')
    for k,v in myDB.iteritems():
        for x in v:
            if slot.lower() in x.lower():
                count += 1
                myItems.append(k)
    if count < 1:
        print '\tNo items found with a slot type of "%s"'%slot
    else:
                
        count = 0
        stat = raw_input('Enter stat to look for: ')
        
        myFile = "%s_%s-with-%s.txt"%(myDir,slot.lower(),stat.lower())
        fileIO = open(myFile, 'w')
        fileIO.write('\n')
        fileIO.close
        
        fileIO = open(myFile, 'a')
        for x in myItems:
            for y in myDB[x]:
                newStat = 'blank'
                # primary stats appear as '+N Stat'; anything else is matched as-is
                if stat.lower() in ['agility', 'stamina', 'intellect', 'spirit']:
                    if stat.lower() in y.lower() and '+' in y:
                        count += 1
                        newStat = y.split('+')[1].split(' ')[0] + ' ' + stat.lower()
                else:
                    if stat.lower() in y.lower():
                        count += 1
                        newStat = y
                if newStat != 'blank':
                    fileIO.write("%s\n\t%s\n"%(x, newStat))
                    
                    
        if count < 1:
            print "\tNo %s items found with %s"%(slot, stat)
        fileIO.close()

        fileIO = open(myFile, 'r')
        for x in fileIO:
            print x
        fileIO.close()
This finds every distinct stat that appears on the items. If an item has set bonuses, the names of the other pieces in the set will not show up as stats (item names are filtered out).

Code:
import pickle

myFile = open('itemdatabase.db', 'rb')
myDB = pickle.load(myFile)
myFile.close()

myDir = "D:/_itemsearches/"
myFile = "%squeryable-stats.txt"%myDir
fileIO = open(myFile, 'w')  # truncate so the file starts out empty
fileIO.write('\n')
fileIO.close()

allStats = []
allItems = []
for k,v in myDB.iteritems():
    allItems.append(k)
for k,v in myDB.iteritems():
    for x in v:
        if x not in allStats and x not in allItems:
            allStats.append(x)

fileIO = open(myFile, 'a')            
for x in allStats:
    fileIO.write(x)
    fileIO.write('\n')
    print x
    print '\n'
fileIO.close()
In both examples, we first create the file and make sure it is empty, then append all the data.
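
And if all you want is to read the DB back and display it, the bare-bones version is a plain load and dump; a minimal sketch:

Code:
import pickle
import pprint

with open('itemdatabase.db', 'rb') as f:
    myDB = pickle.load(f)

# myDB maps item name -> list of attribute strings
pprint.pprint(myDB)
print '%d items total' % len(myDB)
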
03-17-14, 09:49 AM   #12
JDoubleU00
A Firelord
 
Join Date: Mar 2008
Posts: 463
Originally Posted by xxauroraxx
This will look for all items with a specific slot and stat (such as intellect or increased spell damage). [code snipped; see post #11]
Thank you, I now have some tinkering to do.
