备份百度空间Blog的Python程序

linxh

UID: 23238
帖子: 122
积分: 280
在线时间: 20 小时

1^# linxh 发表于 2006-11-07 16:04

备份百度空间Blog的Python程序

#!/usr/bin/python
#BaiDu Blog Backuper v2
import urllib
import string
import re
def Save2File(url, fn):
    """Download *url* and store the response body in a local file named *fn*.

    Before the file is opened, every character of *fn* that Windows does
    not allow in a file name (/ \\ : * ? " < > |) is replaced by a single
    space, so an arbitrary blog title can be used as the name.  The body is
    written in binary mode so it is stored exactly as it was received.

    Progress is reported on standard output.  If the URL cannot be opened
    (IOError), a message is printed and no file is created.

    Always returns None.
    """
    # Imported locally so the function runs on Python 2 (urllib.urlopen)
    # as well as Python 3 (urllib.request.urlopen).
    try:
        from urllib import urlopen
    except ImportError:
        from urllib.request import urlopen

    print("Retrieving:  %s" % (url,))
    print("Save as:  %s" % (fn,))

    try:
        response = urlopen(url)
    except IOError:
        print("\nCan not retrieve  %s !\nThe connection cannot be made!\n" % (url,))
        return

    # Close the connection even if reading the body fails.
    try:
        body = response.read()
    finally:
        response.close()

    # NOTE(review): the "<" entry was missing from the pasted source (the
    # line was an unterminated string); "|" was not handled at all and is
    # added here because it is also rejected by Windows.
    for forbidden in '/\\:*?"<>|':
        fn = fn.replace(forbidden, " ")

    with open(fn, "wb") as out:
        out.write(body)
def GetContent(url):
    """Return the response body fetched from *url*.

    The body is returned exactly as read from the connection.  If the URL
    cannot be opened (IOError), a message is printed and None is returned,
    so callers must check the result before using it.
    """
    # Imported locally so the function runs on Python 2 (urllib.urlopen)
    # as well as Python 3 (urllib.request.urlopen).
    try:
        from urllib import urlopen
    except ImportError:
        from urllib.request import urlopen

    try:
        response = urlopen(url)
    except IOError:
        print("\nCan not retrieve  %s !\nThe connection cannot be made!\n" % (url,))
        return None

    # Close the connection even if reading the body fails.
    try:
        return response.read()
    finally:
        response.close()

if __name__ == "__main__":
    # Walk the blog's index pages (index/0, index/1, ...), collect one
    # (relative link, title) pair per matching line, then print the list.
    list_base = "http://hi.baidu.com/linxhchina/blog/index/"
    artical_base = "http://hi.baidu.com/"

    # NOTE(review): the pattern in the pasted source was r'(.*?)' -- its HTML
    # was stripped by the forum.  That pattern has a single group (the code
    # below needs two) and matches every line, so paging never stopped.
    # The pattern here is a reconstruction: group 1 is the article path
    # relative to artical_base, group 2 is the link text.  Presumably article
    # links live under "linxhchina/blog/item/"; confirm against real markup.
    rexp = re.compile(r'<a\s+href="/(linxhchina/blog/item/[^"]*)"[^>]*>(.*?)</a>')

    queue = []
    page_no = 0
    while True:
        list_url = "%s%d" % (list_base, page_no)
        page_no += 1
        content = GetContent(list_url)
        if not content:
            # GetContent returns None when the page cannot be fetched;
            # stop instead of crashing on None.split().
            break
        # Only the first match on each line is recorded.
        found = [hit.groups()
                 for hit in (rexp.search(line) for line in content.split("\n"))
                 if hit]
        if not found:
            # An index page without any article link ends the paging.
            break
        queue.extend(found)

    # NOTE(review): these empty prints were most likely HTML page tags that
    # the forum stripped; they are kept as they appear in the source.
    print("")
    print("")
    print("Baidu Blog List")
    print("")
    print("")
    print("")
    for link, title in queue:
        artical_url = "%s%s" % (artical_base, link)
        # NOTE(review): the source had a broken literal with one %s for two
        # arguments; the anchor markup is a reconstruction of the stripped text.
        print('<a href="%s">%s</a><br>' % (artical_url, title))
        # Enable the next line to download each article as well:
        # Save2File(artical_url, title + ".html")
    print("")
    print("")
    print("")