BBS图片自动下载的脚本实现(v1.0)
taoyuliang
|
1#
taoyuliang 发表于 2008-11-16 03:01
#!/usr/bin/python
# BBS picture auto-download script (v1.0)
#
# Author's note (translated from the original post):
#   I got back from the Graduate board meet-up today and it was already past
#   10 pm. Yesterday I wrote a script that posts automatically on the Shuiyuan
#   BBS, so I suddenly felt like writing a Python script that downloads
#   pictures automatically. A similar tool already exists (BBSPicSpider), but
#   it is built on .NET. I treated this as Python practice and started
#   writing. It went fairly smoothly, although I hit many small problems that
#   I solved with the Python library documentation plus Baidu. It is too late
#   today; I will write up a summary in a couple of days. Here is the code
#   first. This is version 1.0: the basic functionality works, but there are
#   a few small bugs that need further study. Anyone interested is welcome to
#   discuss :)
#
# Download pictures from PPPerson @ bbs.sjtu.edu.cn

import os
import re

try:  # Python 2 module layout, which the script was written against
    from urllib import ContentTooShortError, urlcleanup, urlencode, urlretrieve
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3 exposes the same names under urllib.*
    from urllib.error import ContentTooShortError, URLError
    from urllib.parse import urlencode
    from urllib.request import Request, urlcleanup, urlopen, urlretrieve


def requestURL(url, datas, headers=None):
    """Open *url* and return the response object, or None on failure.

    url     -- address to open.
    datas   -- already URL-encoded request body, or None for no body.
    headers -- dict of request headers; when empty or None a default
               'User-Agent' header is sent.

    A URLError is reported on stdout and turned into a None return value, so
    callers must check the result before reading from it.
    """
    if not headers:
        headers = {'User-Agent': 'Mozilla/3.0'}

    # Python 3 only accepts a bytes body; on Python 2 `bytes` is `str`, so a
    # normal Python 2 string passes through untouched.
    if datas is not None and not isinstance(datas, bytes):
        datas = datas.encode('utf-8')

    req = Request(url, datas, headers)
    try:
        return urlopen(req)
    except URLError as e:
        if hasattr(e, 'reason'):
            print("Failed to reach server:  %s" % (e.reason,))
        elif hasattr(e, 'code'):
            print("Can't fulfill the requset:  %s" % (e.code,))
        # There is no response to hand back; the original fell through to
        # `return res` here with `res` unbound.
        return None


def _readText(res):
    """Read the whole body of response *res*, close it, and return it as str."""
    try:
        body = res.read()
    finally:
        res.close()
    if not isinstance(body, str):
        # Python 3 returns bytes. latin-1 maps every byte to one character, so
        # decoding cannot fail and the ASCII markup stays searchable with the
        # str patterns; non-ASCII page text is not meaningfully decoded.
        body = body.decode('latin-1')
    return body


def getSubjectsURL(cmpPattern):
    """Return every match of *cmpPattern* in the PPPerson board index page.

    cmpPattern -- compiled regular expression applied with findall().

    Returns an empty list when the page could not be fetched.
    """
    board_url = 'http://bbs.sjtu.edu.cn/bbstdoc'
    datas = urlencode({'board': 'PPPerson'})

    res = requestURL(board_url, datas)
    if res is None:
        return []
    return cmpPattern.findall(_readText(res))


def getImagesURL(url, cmpPattern):
    """Return the distinct matches of *cmpPattern* in one subject page.

    url        -- request data identifying the subject, sent to the bbstcon
                  page as the request body.
    cmpPattern -- compiled regular expression applied with findall().

    Duplicates are dropped and first-seen order is kept. Returns an empty
    list when the page could not be fetched.
    """
    subject_url = 'http://bbs.sjtu.edu.cn/bbstcon'

    res = requestURL(subject_url, url)
    if res is None:
        return []

    seen = set()
    imagesURL = []
    for item in cmpPattern.findall(_readText(res)):
        if item not in seen:
            seen.add(item)
            imagesURL.append(item)
    return imagesURL


def downloadImage(imageURL, subID):
    """Download one image into download/<subID>/ and report the outcome.

    imageURL -- image path, appended to 'http://bbs.sjtu.edu.cn'.
    subID    -- subject identifier (a string) used as the directory name.

    Returns True when the file was saved, False when the directory could not
    be created or the transfer failed; failures are reported on stdout.
    """
    image_url = 'http://bbs.sjtu.edu.cn' + imageURL

    target_dir = os.path.join('download', subID)
    if not os.path.isdir(target_dir):
        try:
            os.makedirs(target_dir)
        except OSError:
            # Only give up if the directory really is missing; it may have
            # been created between the check and the call.
            if not os.path.isdir(target_dir):
                print("Failed to create directories")
                return False

    filename = os.path.join(target_dir, imageURL.split('/')[-1])

    # clear the cache that may have been built up
    # by previous calls to urlretrieve()
    urlcleanup()

    try:
        urlretrieve(image_url, filename)
    except ContentTooShortError:
        print("The data available was less than that of expected")
        print("Downloading file %s was interrupted"
              % os.path.basename(filename))
        return False
    except EnvironmentError as e:
        # Any other network or file error: skip this image instead of
        # aborting the whole run.
        print("Failed to download %s: %s" % (image_url, e))
        return False

    size = os.path.getsize(filename) // 1024
    print(">>>File %s (%s Kb) was done..." % (filename, size))
    return True


def main():
    """Download every image of every subject listed on the board index."""
    # NOTE(review): the two regex literals below are reproduced exactly as
    # they appear in the post. The forum software appears to have stripped
    # HTML-like text out of them: the image pattern is empty, and the subject
    # pattern has a single capture group although the loop below reads two
    # fields per match (subject id, subject request data). Restore the real
    # patterns before running this script.
    findSubjectsPattern = re.compile(r'(\d+).*?', re.I | re.DOTALL)
    findImagesPattern = re.compile(r'', re.I)

    subjectsList = getSubjectsURL(findSubjectsPattern)

    print("Downloading begins...\n")

    filecount = 0
    for subject in subjectsList:
        # The posted listing indexed the whole list with [0] and [1] on every
        # pass; the per-entry subscript looks like it was lost in the paste.
        subject_id, subject_query = subject[0], subject[1]
        print("\nSubject %s begins..." % subject_id)
        for image in getImagesURL(subject_query, findImagesPattern):
            if downloadImage(image, subject_id):
                filecount += 1

    print("\nAll downloads were done")
    print("%d files were downloaded totally\n" % filecount)


if __name__ == '__main__':
    main()