图片提取程序(python)
这个程序可以把html文件里的图片提取出来,重新生成一个按原顺序排列、只包含图片的html文件,
相信对那些爱收集图片的人会有用。本来可以把它写成直接从网上抓图片的,不过怕网络连接处理不好,因此只对本地文件操作。感兴趣的人
可以把htmldata模块下载下来看看,我把它放到资源中心。
写这个东西的时候,url的quote问题把我挡了很久:url会自动把特殊字符转成%xx的形式,最常见的是空格被转成%20,所以如果文件名里有空格,
html里的url就会是xxx%20xxx的形式。有个文件的名字里刚好有个%,它被转成了%25,我弄了一整天,一直以为是编码的问题,最后还是
mailing list上的人告诉我的,菜啊!
import urllib, htmldata, time,string,sys,shutil,re,os,stat
'''
This program extracts the jpg images referenced by html files and
creates a simple html page for viewing them.
The arguments must be absolute directory paths.
'''
def _head_markup(contents):
    """Build the opening markup of the generated page.

    Scans the tag stream produced by ``htmldata.tagextract(contents)`` for
    the page title (the text item that directly follows the ``title`` tag)
    and for a ``meta`` tag whose ``content`` attribute mentions ``charset``.
    Scanning stops as soon as both have been seen.

    NOTE(review): the exact markup emitted here is a reconstruction; the
    literals in the source this was derived from were empty strings.
    Adjust the tags if a specific layout is required.

    Returns a list of strings ending with the opening ``<body>`` tag.
    """
    parts = ['<html>\n<head>\n']
    # 0: no <title> seen yet, 1: <title> was the previous item, 2: handled
    title_state = 0
    found_charset = False
    for item in htmldata.tagextract(contents):
        is_tag = isinstance(item, tuple)
        if title_state == 1:
            title_state = 2
            if not is_tag:
                parts.append('<title>' + item + '</title>\n')
                continue
            # An empty <title></title> has no text item; fall through so the
            # tag that followed it is still examined below.
        if is_tag and item[0] == 'title':
            if title_state == 0:
                title_state = 1
            continue
        if is_tag and item[0] == 'meta':
            # Keep the charset declaration so the copied title still renders
            # in the encoding of the source page.
            content = item[1].get('content') or ''
            if 'charset' in content.lower():
                parts.append('<meta http-equiv="Content-Type" content="'
                             + content + '">\n')
                found_charset = True
                continue
        if title_state == 2 and found_charset:
            break
    parts.append('</head>\n<body>\n')
    return parts


def extractjpg(url, targetpath, size, action='0'):
    """Extract the jpg images of one html file into an image-only page.

    Parameters:
        url        -- path of the html file to read (opened with
                      ``urllib.urlopen``).
        targetpath -- existing directory that receives the new html file
                      and a ``<page>_files`` sub-directory with the images.
        size       -- images whose file size in bytes is not greater than
                      this value are ignored.
        action     -- ``'0'`` (the default) only prints the log and writes
                      nothing; any other value writes the page and copies
                      the images.

    An ``img`` reference is accepted when its unquoted URL ends in
    ``.jpg``/``.jpeg`` (case-insensitive), names an existing local file and
    that file is larger than ``size``; other jpg references are reported as
    ignored.  Returns ``None``.

    NOTE(review): the generated markup (``<img>`` tags, separators) is a
    reconstruction; the literals in the source this was derived from were
    empty strings.
    """
    print('')
    print('----------------------------------------------------------')  # for log format

    page_name = os.path.basename(url)
    # "<page>.htm[l]" -> "<page>_files": the directory name used for the
    # images next to the generated page.
    base, ext = os.path.splitext(url)
    if ext.lower() in ('.htm', '.html'):
        jpgpath = base + '_files'
    else:
        jpgpath = url + '_files'
    filesubpath = os.path.basename(jpgpath)
    newpath = os.path.join(targetpath, filesubpath)

    source = urllib.urlopen(url)
    try:
        contents = source.read()
    finally:
        source.close()

    newfile = _head_markup(contents)
    jpgs = []       # base names of the accepted images (for the log)
    to_copy = []    # (source path, base name) of every accepted image
    ignore = []     # jpg references that are missing or too small

    # now find all jpgs
    for match in htmldata.urlextract(contents, url):
        if match.tag_name != 'img':
            continue
        # URLs escape special characters as %xx; undo that to get the real
        # "path/file" on disk.
        filename = urllib.unquote(match.url)
        if os.path.splitext(filename)[1].lower() not in ('.jpg', '.jpeg'):
            continue
        realname = os.path.basename(filename)
        if os.path.isfile(filename) and os.path.getsize(filename) > size:
            jpgs.append(realname)
            to_copy.append((filename, realname))
            # Re-quote the relative link so names containing spaces, '%' or
            # quote characters stay valid inside the src attribute.
            link = urllib.quote(filesubpath + '/' + realname)
            newfile.append('<img src="' + link + '">\n')
            newfile.append(
                '<br>\n'
                '---------------------------------------------------------\n'
                '<br>\n')
        else:
            ignore.append(filename)
    newfile.append('</body>\n</html>\n')

    # print the log message
    print('@file||extracting from:: ' + page_name)
    print('@info||extracted ' + str(len(jpgs)) + ' image::')
    print(jpgs)
    if ignore:
        print('@wanning||ignore ' + str(len(ignore)) + ' image::')
        print(ignore)
    if action == '0':
        print("just test" + page_name)
        return

    # write the html file and copy the jpgs to the target directory
    newhtmlname = os.path.join(targetpath, page_name)
    htmlfile = open(newhtmlname, 'w', 102400)
    try:
        htmlfile.writelines(newfile)
    finally:
        htmlfile.close()
    if not os.path.exists(newpath):
        os.mkdir(newpath)
    for src, name in to_copy:
        # Copy the very file whose existence and size were checked above.
        shutil.copyfile(src, os.path.join(newpath, name))
#------------------------------------------------------------------------------
if __name__ == '__main__':
    # Command line: SRCDIR DISTDIR [ACTION]
    #   SRCDIR  -- directory that holds the html files to process
    #   DISTDIR -- existing directory that receives the generated files
    #   ACTION  -- forwarded to extractjpg(); defaults to '0', which is
    #              extractjpg's own default
    if len(sys.argv) < 3:
        print('usage: ' + sys.argv[0] + ' srcdir distdir [action]')
        sys.exit(1)
    srcdir = sys.argv[1]
    disdir = sys.argv[2]
    if len(sys.argv) > 3:
        test = sys.argv[3]
    else:
        test = '0'
    size = 1024  # size threshold forwarded to extractjpg()

    if not os.path.isdir(srcdir):
        print('source path is incorrect' + srcdir)
        sys.exit(1)
    if not os.path.isdir(disdir):
        print('target path is incorrect' + disdir)
        sys.exit(1)

    targetfiles = []
    # Sorted so the "@process file N" numbering is reproducible.
    for elem in sorted(os.listdir(srcdir)):
        fullname = os.path.join(srcdir, elem)
        # Test the joined path: the bare name would be resolved against the
        # current working directory instead of srcdir.
        if not os.path.isfile(fullname):
            continue
        if os.path.splitext(elem)[1].lower() in ('.htm', '.html'):
            targetfiles.append(fullname)

    for index, elem in enumerate(targetfiles):
        print('@process file ' + str(index + 1))
        extractjpg(elem, disdir, size, test)