图片提取程序(python)

vinge

UID: 8614
帖子: 71
积分: 163
在线时间: 7 小时

1^# vinge 发表于 2006-12-16 21:12

图片提取程序(python)

这个程序可以把html文件里的图片提取出来，重新生成一个按原顺序排列的只有图片的html文件，相信对那些爱收集图片的人会有用的。本来可以把它写成直接在网上抓图片的，不过怕网络连接处理不好，因此只对本地文件操作。感兴趣的人可以把htmldata模块抓下来看看，我放上资源中心。
写这个东西的时候那个url的quote问题把我挡了很久：url会自动把特殊字符转成%xx的形式，常见的是空格变成%20。如果文件名里头有空格，在html里头的url就会是xxx%20xxx的形式。有个文件刚好有个%在名字里，变成了%25，弄了一整天，以为是编码的问题，最后还是mailing list上的人告诉我的，菜啊！

import urllib, htmldata, time,string,sys,shutil,re,os,stat
'''
This program extracts the JPEG images referenced by an HTML file and
creates a simple HTML page for viewing them.
The command-line arguments must be absolute directory paths.
'''
def extractjpg(url, targetpath, size, action='0'):
    """Build an image-only HTML page from a locally saved HTML file.

    The page at ``url`` is scanned for ``<img>`` references to ``.jpg`` /
    ``.jpeg`` files.  Every referenced file that exists on disk and is
    larger than ``size`` bytes is listed in a new HTML page written to
    ``targetpath`` (same file name as the source page) and copied into
    ``targetpath/<page>_files``.  All other JPEG references are reported
    as ignored.

    NOTE(review): the forum paste this code was recovered from had lost
    its indentation and every HTML literal (they appeared as ``''``).
    The control flow and the markup below are a reconstruction -- verify
    the generated page against the intended layout.

    Args:
        url: path of the saved HTML page, using ``/`` separators.
            Presumably an absolute local path -- confirm with callers.
        targetpath: existing directory that receives the new page and
            the image sub-directory.
        size: minimum file size in bytes; images that are not strictly
            larger are ignored.
        action: ``'0'`` only prints the report (dry run); any other
            value also writes the page and copies the images.

    Returns:
        None.
    """
    print('')
    print('----------------------------------------------------------')  # for log format

    page_name = url.split('/')[-1]
    # Directory expected to hold the page's images: "<page>_files".
    # The suffix is anchored to the last path component so that a directory
    # name containing "htm" can no longer be matched by accident.
    jpgpath = re.sub(r'\.htm[^/]*$', '_files', url)
    filesubpath = jpgpath.split('/')[-1]
    newpath = targetpath + '/' + filesubpath

    page = urllib.urlopen(url)
    try:
        contents = page.read()
    finally:
        page.close()

    # --- header: pick up the title text and the charset <meta> ------------
    # assumes htmldata.tagextract yields plain strings for text and
    # (tag_name, attrs_dict) tuples for tags -- TODO confirm
    title = None
    charset = None
    expect_title_text = False
    for u in htmldata.tagextract(contents):
        if expect_title_text:
            # The item right after <title> is the title text, unless the
            # title is empty and the closing tag follows immediately.
            expect_title_text = False
            if not isinstance(u, tuple):
                title = u
            continue
        if isinstance(u, tuple):
            if u[0] == 'title' and title is None:
                expect_title_text = True
                continue
            if u[0] == 'meta' and charset is None:
                content = u[1].get('content') or ''
                if 'charset' in content:
                    charset = content
                    continue
        if title is not None and charset is not None:
            break  # both header items found; no need to scan the body

    newfile = ['<html>\n<head>\n']
    if charset:
        newfile.append(
            '<meta http-equiv="Content-Type" content="%s">\n' % charset)
    if title:
        newfile.append('<title>%s</title>\n' % title)
    newfile.append('</head>\n<body>\n')

    # --- body: one <img> per accepted JPEG, in document order ---------------
    jpgs = []     # file names of accepted images (reported and copied)
    sources = []  # on-disk path of each accepted image, parallel to jpgs
    ignore = []   # JPEG references that are missing or too small
    for u in htmldata.urlextract(contents, url):
        if u.tag_name != 'img':
            continue
        # Undo %xx escaping to get the real "path/file" on disk.
        filename = urllib.unquote(u.url)
        if filename.split('.')[-1].lower() not in ('jpg', 'jpeg'):
            continue
        realname = filename.split('/')[-1]
        if os.path.isfile(filename) and os.path.getsize(filename) > size:
            jpgs.append(realname)
            sources.append(filename)
            # Re-quote for the src attribute so names containing spaces or
            # '%' still resolve in a browser.
            newfile.append(
                '<img src="%s">\n'
                % urllib.quote(filesubpath + '/' + realname))
            newfile.append(
                '<br>\n'
                '---------------------------------------------------------\n'
                '<br>\n')
        else:
            ignore.append(filename)
    newfile.append('</body>\n</html>\n')

    # --- log ---------------------------------------------------------------
    print('@file||extracting from:: ' + page_name)
    print('@info||extracted ' + str(len(jpgs)) + ' image::')
    print(jpgs)
    if ignore:
        print('@wanning||ignore  ' + str(len(ignore)) + ' image::')
        print(ignore)
    if action == '0':
        print("just test" + page_name)
        return

    # --- write the html file and copy the jpgs to the destination dir ------
    newhtmlname = targetpath + '/' + page_name
    with open(newhtmlname, 'w', 102400) as htmlfile:
        htmlfile.writelines(newfile)

    if not os.path.exists(newpath):
        os.mkdir(newpath)
    # Copy from the path that was actually checked above, so an image stored
    # outside "<page>_files" does not make the copy fail.
    for source, realname in zip(sources, jpgs):
        shutil.copyfile(source, newpath + '/' + realname)
#------------------------------------------------------------------------------
if __name__ == '__main__':
    # Command line: SRCDIR DESTDIR [ACTION]
    #   SRCDIR  - directory scanned (non-recursively) for .htm/.html files
    #   DESTDIR - existing directory that receives the generated output
    #   ACTION  - passed through to extractjpg(); '0' (the default when
    #             omitted) is a dry run that only prints the report
    if len(sys.argv) < 3:
        print('usage: ' + sys.argv[0] + ' srcdir destdir [action]')
        sys.exit(1)
    srcdir = sys.argv[1]
    disdir = sys.argv[2]
    if len(sys.argv) > 3:
        test = sys.argv[3]
    else:
        test = '0'
    size = 1024  # size threshold in bytes handed to extractjpg()

    if not os.path.isdir(srcdir):
        print('source path is incorrect' + srcdir)
        sys.exit(1)
    if not os.path.isdir(disdir):
        # Report the target argument (the original printed the source one).
        print('target path is incorrect' + disdir)
        sys.exit(1)

    targetfiles = []
    for elem in os.listdir(srcdir):
        fullname = os.path.join(srcdir, elem)
        # Test the joined path: the bare name is only meaningful relative
        # to the current working directory, not to srcdir.
        if os.path.isfile(fullname) and elem.split('.')[-1] in ('htm', 'html'):
            targetfiles.append(fullname)

    for i, elem in enumerate(targetfiles):
        print('@process file ' + str(i + 1))
        extractjpg(elem, disdir, size, test)