BBS图片自动下载的脚本实现(v1.0)


      今天Graduate版聚归来,已是晚上10点多了。昨天刚写了一个在水源上自动发文的脚本,所以又突然想写一个自动下载图片的Python脚本。
      虽然类似的工具早有人写过(叫BBSPicSpider),不过它是用.NET来做的。反正就当学习Python的练手,于是开始试着编写。过程还是比较顺利,不过当中遇到很多小问题,查了很多Python的库文档加上百度,得以解决。今天太晚了,过两天再总结一下。
      先把代码贴上了,不过这个是1.0版本,基本功能已经可以实现了,不过有一点小BUG,有待进一步研究改进,有兴趣的朋友可以交流一下:)
[color="#804040"]  1 [color="#0000ff"]#!/usr/bin/python
[color="#804040"]  2
[color="#804040"]  3 [color="#0000ff"]# Download pictures from PPPerson @ bbs.sjtu.edu.cn
[color="#804040"]  4
[color="#804040"]  5 [color="#a020f0"]import re
[color="#804040"]  6 [color="#a020f0"]import os
[color="#804040"]  7 [color="#a020f0"]from urllib  [color="#a020f0"]import ContentTooShortError
[color="#804040"]  8 [color="#a020f0"]from urllib2 [color="#a020f0"]import URLError
[color="#804040"]  9 [color="#a020f0"]import urllib, urllib2
[color="#804040"] 10
def requestURL(url, datas, headers=None):
    """Open *url* through urllib2 and return the response object.

    Parameters:
        url     -- address to request.
        datas   -- request body passed to urllib2.Request.  Per the
                   urllib2 documentation a non-None body turns the
                   request into a POST.
        headers -- optional dict of HTTP headers.  When omitted (or
                   empty) a minimal 'User-Agent' header is sent.

    Returns:
        The object returned by urllib2.urlopen(), or None when the
        request failed.  Failures are reported on stdout instead of
        being raised, so callers must check for None before reading.
    """
    if not headers:
        headers = {'User-Agent': 'Mozilla/3.0'}

    req = urllib2.Request(url, datas, headers)

    try:
        return urllib2.urlopen(req)
    except URLError as e:
        # HTTPError is a subclass of URLError and can expose both
        # attributes; test the more specific 'code' first so an HTTP
        # status is not masked by the generic 'reason' branch.
        if hasattr(e, 'code'):
            print("Can't fulfill the requset:  %s" % (e.code,))
        elif hasattr(e, 'reason'):
            print("Failed to reach server:  %s" % (e.reason,))
        # Without an explicit result here the function used to reach
        # 'return res' with 'res' unbound and die with
        # UnboundLocalError, hiding the real network error.
        return None
[color="#804040"] 30
def getSubjectsURL(cmpPattern, board='PPPerson'):
    """Fetch a board's topic listing and extract the subject entries.

    Parameters:
        cmpPattern -- compiled regular expression applied to the page
                      with findall().
        board      -- board name sent as the 'board' form field.
                      Defaults to 'PPPerson', the value that used to be
                      hard-coded here.

    Returns:
        Whatever cmpPattern.findall() yields for the listing page
        (a list of strings, or of tuples when the pattern has several
        groups).  An empty list is returned when no response object
        is available.
    """
    board_url = 'http://bbs.sjtu.edu.cn/bbstdoc'

    # Encode the board selector as form data for the request.
    datas = urllib.urlencode({'board': board})

    res = requestURL(board_url, datas)
    # Defensive: tolerate a requestURL() that signals failure with None.
    if res is None:
        return []

    # Always release the connection, even if reading fails.
    try:
        page = res.read()
    finally:
        res.close()

    return cmpPattern.findall(page)
[color="#804040"] 50
def getImagesURL(url, cmpPattern):
    """Return the unique image URLs found on one subject page.

    Parameters:
        url        -- request data identifying the subject; it is passed
                      unchanged as the body of the request to 'bbstcon'.
        cmpPattern -- compiled regular expression applied to the page
                      with findall().

    Returns:
        A list of the matches with duplicates removed, kept in the
        order they first appear on the page.  An empty list is returned
        when no response object is available.
    """
    subject_url = 'http://bbs.sjtu.edu.cn/bbstcon'

    res = requestURL(subject_url, url)
    # Defensive: tolerate a requestURL() that signals failure with None.
    if res is None:
        return []

    # Always release the connection, even if reading fails.
    try:
        page = res.read()
    finally:
        res.close()

    # De-duplicate while keeping page order; going through a bare set()
    # would hand the images back in arbitrary order.
    seen = set()
    imagesURL = []
    for item in cmpPattern.findall(page):
        if item not in seen:
            seen.add(item)
            imagesURL.append(item)

    return imagesURL
[color="#804040"] 75
def downloadImage(imageURL, subID):
    """Download one image into ./download/<subID>/.

    Parameters:
        imageURL -- site-relative path of the image; it is appended to
                    'http://bbs.sjtu.edu.cn' and its last '/'-separated
                    component becomes the local file name.
        subID    -- string naming the sub-directory of 'download' that
                    receives the file.

    Progress and download errors are reported on stdout; nothing is
    returned.

    Raises:
        OSError -- the target directory is missing and could not be
                   created.
    """
    image_url = 'http://bbs.sjtu.edu.cn' + imageURL

    # Create the target directory.  makedirs() raises OSError when the
    # directory is already there, which is fine; any other failure
    # (permissions, a file in the way, ...) must not be swallowed.
    target_dir = './download/' + subID
    try:
        os.makedirs(target_dir)
    except OSError:
        if not os.path.isdir(target_dir):
            raise

    filename = 'download/' + subID + '/' + imageURL.split('/')[-1]

    # Drop anything cached by earlier urlretrieve() calls.
    urllib.urlcleanup()

    try:
        urllib.urlretrieve(image_url, filename)
    except ContentTooShortError:
        print("The data available was less than that of expected")
        print("Downloading file %s was interrupted"
              % os.path.basename(filename))
    except IOError as e:
        # Any other network/file error: report it and let the caller
        # carry on with the next image instead of aborting the batch.
        print("Downloading file %s failed: %s"
              % (os.path.basename(filename), e))
    else:
        # File size in whole kilobytes.
        size = os.path.getsize(filename) // 1024
        print(">>>File %s (%s Kb) was done..." % (filename, size))
[color="#804040"]109
[color="#804040"]110
if __name__ == '__main__':

    # NOTE(review): both patterns below look truncated -- presumably the
    # HTML fragments they contained were stripped when the listing was
    # published.  The loop further down reads element [0] of every match
    # as the subject id and element [1] as the request data, so the
    # subject pattern needs two capture groups.  Restore the real
    # patterns before running.
    findSubjectsPattern = re.compile(
        r'(\d+).*?', re.I | re.DOTALL)
    findImagesPattern = re.compile(r'', re.I)

    # Collect the subject entries of the board.
    subjectsList = getSubjectsURL(findSubjectsPattern)

    print("Downloading begins...\n")

    # Number of download attempts made so far.  Starts at 0 so the
    # summary is not off by one.
    filecount = 0

    # Visit each subject in turn.  Indexing the list with the constants
    # 0 and 1 processed the same entry on every pass.
    for subject in subjectsList:
        print("\nSubject %s begins..." % subject[0])
        imagesList = getImagesURL(subject[1], findImagesPattern)
        # Download every image of this subject into its own directory.
        for imageURL in imagesList:
            downloadImage(imageURL, subject[0])
            filecount += 1

    print("\nAll downloads were done")
    print("%d files were downloaded totally\n" % filecount)
[color="#804040"]136