Thread Pool Spider Issue...

RayNorth · May 19, 2010

I am running this code:

Code:

'''
This is a skeletal working web spider, with virtually no error-checking.
You need to pass in a URL that points to a directory (e.g. 
http://www.foo.com/bar/).

This version uses a thread pool, passing in each URL to be retrieved to 
a queue and getting back the list of links in another queue.  It's more 
efficient than the brute-force version because threads are re-used and
because there's no polling.
'''


import sys
import string
import urllib
import urlparse
import htmllib
import formatter
from cStringIO import StringIO
import threading
import Queue
import time



# Number of Retriever worker threads the pool starts; the __main__ block
# passes this value to Spider as maxThreads.
MAX_THREADS = 3






def xor(a, b):
    """Return True when exactly one of a and b is truthy, otherwise False."""
    # The two truth values differ precisely when only one operand is truthy.
    return bool(a) != bool(b)



class Token:
    """Work item passed to a worker thread through the input queue.

    A token carries either a URL to retrieve or a shutdown request,
    never both and never neither.
    """

    def __init__(self, URL=None, shutdown=None):
        """Store the URL or the shutdown flag.

        URL      -- address of the page to retrieve, or None
        shutdown -- truthy to ask the receiving worker to stop, or None

        Raises ValueError when both or neither of the arguments are truthy.
        """
        # Exactly one of the two must be set.  A real exception class is
        # raised here: string exceptions are rejected by Python 2.6 and
        # later, which would replace this message with a TypeError.
        if bool(URL) == bool(shutdown):
            raise ValueError(
                "Tsk, tsk, need to set either URL or shutdown (not both)")
        self.URL = URL
        self.shutdown = shutdown



class Retriever(threading.Thread):
    """Worker thread that downloads pages and reports the links on them.

    Tokens are read from inputQueue.  For every URL token exactly one list
    of absolute link URLs is put on outputQueue; a shutdown token ends the
    thread.
    """

    def __init__(self, inputQueue, outputQueue):
        """Remember the queue to read tokens from and the queue to answer on."""
        threading.Thread.__init__(self)
        self.inputQueue = inputQueue
        self.outputQueue = outputQueue

    def run(self):
        """Process tokens until a shutdown token arrives."""
        while True:
            token = self.inputQueue.get()
            if token.shutdown:
                break
            self.URL = token.URL
            try:
                self.getPage()
                links = self.getLinks()
            except Exception as exc:
                # A failed download or parse must neither kill this worker
                # nor leave the request unanswered: Spider counts one result
                # per queued URL and would otherwise wait forever.
                sys.stderr.write("Failed: %s (%s)\n" % (self.URL, exc))
                links = []
            self.outputQueue.put(links)

    def getPage(self):
        """Download self.URL into self.body and parse it."""
        # One write call per line, so output from concurrent workers is less
        # likely to be interleaved in the middle of a line.
        sys.stdout.write("Retrieving: %s\n" % self.URL)
        self.page = urllib.urlopen(self.URL)
        try:
            self.body = self.page.read()
        finally:
            # Release the connection even when reading fails.
            self.page.close()
        self.parse()

    def getLinks(self):
        """Return the parsed page's anchors as absolute URLs."""
        # urljoin resolves relative links against the page's own URL.
        return [urlparse.urljoin(self.URL, link)
                for link in self.parser.anchorlist]

    def parse(self):
        """Feed self.body to an HTML parser, kept as self.parser."""
        # We're using the parser just to get the HREFs
        # We should also use it to e.g. respect <META NOFOLLOW>
        w = formatter.DumbWriter(StringIO())
        f = formatter.AbstractFormatter(w)
        self.parser = htmllib.HTMLParser(f)
        self.parser.feed(self.body)
        self.parser.close()



class RetrievePool:
    """Fixed-size group of Retriever threads sharing two queues.

    URLs go in through put(); each retrieved page's list of links comes
    back through get().
    """

    def __init__(self, numThreads):
        """Create the two queues and start numThreads Retriever workers."""
        self.retrievePool = []
        self.inputQueue = Queue.Queue()
        self.outputQueue = Queue.Queue()
        for _ in range(numThreads):
            worker = Retriever(self.inputQueue, self.outputQueue)
            worker.start()
            self.retrievePool.append(worker)

    def put(self, URL):
        """Queue URL for retrieval by one of the workers."""
        request = Token(URL=URL)
        self.inputQueue.put(request)

    def get(self):
        """Return the next list of links taken from the output queue."""
        links = self.outputQueue.get()
        return links

    def shutdown(self):
        """Send one shutdown token per worker, then wait for all to exit."""
        stopRequests = [Token(shutdown=1) for _ in self.retrievePool]
        for request in stopRequests:
            self.inputQueue.put(request)
        for worker in self.retrievePool:
            worker.join()



class Spider:
    """Collect every URL reachable from a start URL that begins with it.

    Pages are fetched by a RetrievePool; after run() returns, the sorted
    list of all accepted URLs is available as self.URLs.
    """

    def __init__(self, startURL, maxThreads):
        """Set up the crawl state and start a pool of maxThreads workers."""
        self.URLs = []                 # sorted result, filled in by run()
        self.queue = [startURL]        # found but not yet handed to the pool
        self.URLdict = {startURL: 1}   # every accepted URL (used as a set)
        self.include = startURL        # prefix a link needs to be followed
        self.numPagesQueued = 0        # handed to the pool, not yet answered
        self.retriever = RetrievePool(maxThreads)

    def checkInclude(self, URL):
        """Return True when URL starts with the start URL."""
        return URL.startswith(self.include)

    def run(self):
        """Crawl until no page is outstanding, then store the sorted URLs."""
        try:
            self.startPages()
            while self.numPagesQueued > 0:
                self.queueLinks()
                self.startPages()
        finally:
            # Stop the workers even when the crawl is interrupted; otherwise
            # they stay blocked on the input queue and the process cannot
            # exit.
            self.retriever.shutdown()
        self.URLs = sorted(self.URLdict)

    def startPages(self):
        """Hand every waiting URL to the pool and count it as outstanding."""
        while self.queue:
            URL = self.queue.pop()
            self.retriever.put(URL)
            self.numPagesQueued += 1

    def queueLinks(self):
        """Take one page's links from the pool and file the new ones."""
        links = self.retriever.get()
        self.processLinks(links)
        self.numPagesQueued -= 1

    def processLinks(self, links):
        """Queue each link that is new and lies within the current site."""
        for link in links:
            # One write call per line, so it is less likely to be split by
            # output from the worker threads.
            sys.stdout.write("Checking: %s\n" % link)
            # Make sure this is a new URL and is within the current site
            if link not in self.URLdict and self.checkInclude(link):
                self.URLdict[link] = 1
                self.queue.append(link)


if __name__ == '__main__':
    startURL = sys.argv[1]
    spider = Spider(startURL, MAX_THREADS)
    spider.run()
    print
    for URL in spider.URLs:
        print URL

But I am getting this traceback...

Code:

>>> 
Traceback (most recent call last):
  File "C:\Python26\Lib\site-packages\pythonwin\pywin\framework\scriptutils.py", line 325, in RunScript
    exec codeObject in __main__.__dict__
  File "C:\Python26\Scripts\foo3.py", line 155, in <module>
    startURL = sys.argv[1]
IndexError: list index out of range
>>>

JustinEzequiel · May 20, 2010

Probably because you are running it from the PythonWin editor and are not supplying a URL as an argument?

RayNorth · May 21, 2010

Justin:

I went to command prompt:

typed:

Python foo3.py

http://www.thewebsiteIamlookingup.com

I get tracebacks to the following:
line 39, for class Token
line 40, in Token --> URL = None, because at the prompt I am telling it what to insert...

Tek-Tips is the largest IT community on the Internet today!

Members share and learn making Tek-Tips Forums the best source of peer-reviewed technical information on the Internet!

Thread Pool Spider Issue...

RayNorth

Technical User

JustinEzequiel

Programmer

RayNorth

Technical User

Similar threads

Part and Inventory Search

Sponsor