Core Python Programming (2nd Edition) example question

The crawl example from Chapter 20 never runs through, and as a beginner I can't locate the error from the traceback. Could one of the experts here walk me through a debugging process? A pdb-based walkthrough would be ideal, since I hear it is the general-purpose approach, but any method that pins down the problem quickly is a good method.
Also, articles on debugging in python are scarce and shallow, so please recommend something systematic. The program is below.
#!/usr/bin/env python

from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO


class Retriever(object):                # download Web pages

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile

        ldir = dirname(path)            # local directory
        if sep != '/':                  # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):             # create archive dir if nec.
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
            print ldir
        return path

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % \
                      self.url,)
            return retval

    def parseAndGetLinks(self):         # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter( \
            DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class Crawler(object):                  # manage entire crawling process

    count = 0                           # static downloaded page counter

    def __init__(self, url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        print retval
        if retval[0] == '*':            # error situation, do not parse
            print retval, '... skipping parse'
            return

        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and \
               find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                       # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL:')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url: return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()


QUOTE:
Originally posted by tlnxj on 2008-11-18 18:04:

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % \
                      self.url,)
            return retval

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % \
                      self.url,)
        return retval
It should be an indentation problem.
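With return retval indented inside the except block, a successful urlretrieve() never reaches a return statement, so download() falls off the end and implicitly returns None, and that None is exactly what getPage() then tries to index. A minimal standalone illustration of the pitfall (fetch() is a made-up name, not the book's code):

def fetch():
    try:
        result = "downloaded"       # pretend urlretrieve() succeeded
    except IOError:
        result = ("*** ERROR",)
        return result               # only ever reached on the error path

print(fetch())                      # prints None: the success path has no return

Dedent that return so it lines up with try, as in the second version above, and both paths hand a value back to the caller.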
Here is my output:

('10.60.33.177/index.html', <httplib.HTTPMessage instance at 0x00CF14B8>)
( 1 )
URL: http://10.60.33.177/
FILE: 10.60.33.177/index.html
Enter starting URL:http://www.sina.com.cn/index.html
None
Traceback (most recent call last):
  File "/home/apple/crawl.py", line 128, in <module>
    main()
  File "/home/apple/crawl.py", line 125, in main
    robot.go()
  File "/home/apple/crawl.py", line 110, in go
    self.getPage(url)
  File "/home/apple/crawl.py", line 72, in getPage
    if retval[0] == '*':
TypeError: 'NoneType' object is unsubscriptable
>>>
Enter starting URL:http://218.60.32.24/index.html
218.60.32.24
None
Traceback (most recent call last):
  File "/home/apple/crawl.py", line 128, in <module>
    main()
  File "/home/apple/crawl.py", line 125, in main
    robot.go()
  File "/home/apple/crawl.py", line 110, in go
    self.getPage(url)
  File "/home/apple/crawl.py", line 72, in getPage
    if retval[0] == '*':
TypeError: 'NoneType' object is unsubscriptable
>>>
Enter starting URL:http://www.sina.com.cn/
None
Traceback (most recent call last):
  File "/home/apple/crawl.py", line 128, in <module>
    main()
  File "/home/apple/crawl.py", line 125, in main
    robot.go()
  File "/home/apple/crawl.py", line 110, in go
    self.getPage(url)
  File "/home/apple/crawl.py", line 72, in getPage
    if retval[0] == '*':
TypeError: 'NoneType' object is unsubscriptable
>>>
retval ends up being None. The indentation of return retval inside download() is wrong, so on the success path the function never returns anything.
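On the pdb question from the opening post: the quickest way to surface a problem like this is to pause right before the suspicious value is used and inspect it. A rough sketch, assuming the script is saved as crawl.py (the pdb.set_trace() line is a temporary addition for debugging, not part of the book's code; running python -m pdb crawl.py and setting a breakpoint at the line the traceback points to works just as well):

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        import pdb; pdb.set_trace()     # pause here, before retval is touched
        # At the (Pdb) prompt:
        #   p retval    -> None, so download() fell off the end without returning
        #   where       -> call stack: main() -> go() -> getPage()
        #   s           -> step into the next call to dig deeper
        #   c           -> continue running
        print retval
        if retval[0] == '*':            # the line that raises TypeError on None
            print retval, '... skipping parse'
            return

Once p retval shows None, reading download() for a missing return on the success path is the natural next step.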
Thank you. This problem had me stuck for two months; I felt it couldn't be anything big, but I just couldn't find it, and I nearly gave up on python altogether. Your answer gave me the courage to keep going.
One question: how long did it take you to get proficient with Python?


QUOTE:
Originally posted by tlnxj on 2008-11-18 21:22:
Thank you. This problem had me stuck for two months; I felt it couldn't be anything big, but I just couldn't find it, and I nearly gave up on python altogether. Your answer gave me the courage to keep going.
One question: how long did it take you to get proficient with Python?

That's just how it is: there is nobody around me to discuss Python with, so I have to puzzle things out on my own. The experts online operate on a whole different level, and a question from a super beginner like me is surely beneath their notice. If I weren't so reluctant to give up on something as good as Python, I wouldn't have troubled everyone with this.