Core Python Programming (2nd Edition) example question

The crawl example from Chapter 20 never runs through, and as a beginner I can't locate the error from the traceback. Could one of the experts here walk me through a debugging process? A pdb-based walkthrough would be ideal, since I hear it is the general-purpose approach, but any method that pins down the problem quickly is a good method.
Also, articles on debugging in python are scarce and shallow, so please recommend something systematic. The program is below.
#!/usr/bin/env python

from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO


class Retriever(object):                # download Web pages

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile

        ldir = dirname(path)            # local directory
        if sep != '/':                  # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):             # create archive dir if nec.
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
            print ldir
        return path

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % \
                      self.url,)
            return retval

    def parseAndGetLinks(self):         # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter( \
            DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class Crawler(object):                  # manage entire crawling process

    count = 0                           # static downloaded page counter

    def __init__(self, url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        print retval
        if retval[0] == '*':            # error situation, do not parse
            print retval, '... skipping parse'
            return

        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and \
               find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                       # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL:')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url: return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()


QUOTE:
Originally posted by tlnxj on 2008-11-18 18:04:

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % \
                      self.url,)
            return retval

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % \
                      self.url,)
        return retval
It should be an indentation problem.
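With return retval indented inside the except block, a successful urlretrieve() never reaches a return statement, so download() falls off the end and implicitly returns None, and that None is exactly what getPage() then tries to index. A minimal standalone illustration of the pitfall (fetch() is a made-up name, not the book's code):

def fetch():
    try:
        result = "downloaded"       # pretend urlretrieve() succeeded
    except IOError:
        result = ("*** ERROR",)
        return result               # only ever reached on the error path

print(fetch())                      # prints None: the success path has no return

Dedent that return so it lines up with try, as in the second version above, and both paths hand a value back to the caller.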
Here is my output:

('10.60.33.177/index.html', <httplib.HTTPMessage instance at 0x00CF14B8>)
( 1 )
URL: http://10.60.33.177/
FILE: 10.60.33.177/index.html
Enter starting URL:http://www.sina.com.cn/index.html
None
Traceback (most recent call last):
  File "/home/apple/crawl.py", line 128, in <module>
    main()
  File "/home/apple/crawl.py", line 125, in main
    robot.go()
  File "/home/apple/crawl.py", line 110, in go
    self.getPage(url)
  File "/home/apple/crawl.py", line 72, in getPage
    if retval[0] == '*':
TypeError: 'NoneType' object is unsubscriptable
>>>
Enter starting URL:http://218.60.32.24/index.html
218.60.32.24
None
Traceback (most recent call last):
  File "/home/apple/crawl.py", line 128, in <module>
    main()
  File "/home/apple/crawl.py", line 125, in main
    robot.go()
  File "/home/apple/crawl.py", line 110, in go
    self.getPage(url)
  File "/home/apple/crawl.py", line 72, in getPage
    if retval[0] == '*':
TypeError: 'NoneType' object is unsubscriptable
>>>
Enter starting URL:http://www.sina.com.cn/
None
Traceback (most recent call last):
  File "/home/apple/crawl.py", line 128, in <module>
    main()
  File "/home/apple/crawl.py", line 125, in main
    robot.go()
  File "/home/apple/crawl.py", line 110, in go
    self.getPage(url)
  File "/home/apple/crawl.py", line 72, in getPage
    if retval[0] == '*':
TypeError: 'NoneType' object is unsubscriptable
>>>
retval ends up being None. The indentation of return retval inside download() is wrong, so on the success path the function never returns anything.
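On the pdb question from the opening post: the quickest way to surface a problem like this is to pause right before the suspicious value is used and inspect it. A rough sketch, assuming the script is saved as crawl.py (the pdb.set_trace() line is a temporary addition for debugging, not part of the book's code; running python -m pdb crawl.py and setting a breakpoint at the line the traceback points to works just as well):

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        import pdb; pdb.set_trace()     # pause here, before retval is touched
        # At the (Pdb) prompt:
        #   p retval    -> None, so download() fell off the end without returning
        #   where       -> call stack: main() -> go() -> getPage()
        #   s           -> step into the next call to dig deeper
        #   c           -> continue running
        print retval
        if retval[0] == '*':            # the line that raises TypeError on None
            print retval, '... skipping parse'
            return

Once p retval shows None, reading download() for a missing return on the success path is the natural next step.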
Thank you. This problem had me stuck for two months; I felt it couldn't be anything big, but I just couldn't find it, and I nearly gave up on python altogether. Your answer gave me the courage to keep going.
One question: how long did it take you to get proficient with Python?


QUOTE:
Originally posted by tlnxj on 2008-11-18 21:22:
Thank you. This problem had me stuck for two months; I felt it couldn't be anything big, but I just couldn't find it, and I nearly gave up on python altogether. Your answer gave me the courage to keep going.
One question: how long did it take you to get proficient with Python?

That's just how it is: there is nobody around me to discuss Python with, so I have to puzzle things out on my own. The experts online operate on a whole different level, and a question from a super beginner like me is surely beneath their notice. If I weren't so reluctant to give up on something as good as Python, I wouldn't have troubled everyone with this.