学习交流,发布抓网页小说源代码(可包含图片),内详
xukong
|
1#
xukong 发表于 2007-11-21 01:58
学习交流,发布抓网页小说源代码(可包含图片),内详
不好意思,在这里再发一道...
# getbookimg.rb -- download a web novel (chapter pages plus their GIF images).
#
# Author's note (translated from the original post):
#   I want to use Ruby more in my work and daily life. I like reading novels
#   and noticed that the "Gujin Shuwu" site (www.gjsww.com) updates its books
#   quickly, so I wrote this small Ruby program to fetch the pages of a novel
#   from that site; the pages may contain images. This is only a small program
#   for my own casual use and certainly has many shortcomings -- if something
#   does not suit you, please modify it yourself. I am publishing it only to
#   exchange Ruby code with everyone, because I found that very little
#   practical Ruby source code is published online.
#
# Usage (for example from a batch file such as sbzl.bat):
#   ruby getbookimg.rb 随波逐流之神龙传奇 http://www.gjsww.com/Html/Book/17/621/List.html
#
# This creates a folder named after the first argument under the directory
# the script is run from and downloads the chapter pages linked from the
# table-of-contents page given as the second argument.

require 'uri'
require 'iconv'
require 'hpricot'
require 'net/http'
require "open-uri"
require 'fileutils'
##require 'extensions/string'

# Fetches +loc+ over HTTP and returns the raw response body.
#
# +encoding+ is accepted because callers pass 'gbk', but it is ignored: the
# author disabled the Iconv.conv('gbk', encoding, msg) conversion that used it.
def get(loc, encoding=nil)
  Net::HTTP.get(URI.parse(loc))
end

# Appends +ref+ to +prefix+, inserting a "/" unless +ref+ already starts
# with one. Used for both remote URLs and local save paths.
def join_ref(prefix, ref)
  ref.index('/') == 0 ? prefix + ref : prefix + '/' + ref
end

# Opens +url+ through open-uri and yields the handle to the block.
# URI.open is used where open-uri provides it; otherwise this falls back to
# Kernel#open, which is what the script originally called.
def open_url(url, &block)
  return URI.open(url, &block) if URI.respond_to?(:open)
  open(url, &block)
end

# Fail with a usage message instead of a TypeError on String + nil.
if ARGV.length < 2
  abort "用法: ruby getbookimg.rb 小说中文名 包含小说总目录的网页地址"
end

localdirname = ARGV[0] # local folder name (the novel's title)
base = ARGV[1]         # URL of the novel's table-of-contents page

dir_book_path = File.join(File.expand_path("."), localdirname)

# Highest numeric file name among the chapter pages already on disk.
# Chapters numbered below it are treated as downloaded and skipped.
downloaded_numbers = Dir[File.join(dir_book_path, "*.htm*")].map do |file|
  File.basename(file, File.extname(file)).to_i
end
fnmax = ([0] + downloaded_numbers).max

# When true, parse local copies ("list.html" / "993170.html") instead of
# fetching the table of contents and chapter pages over HTTP.
uselocalfiletodebug = false

m = %r{http://([^/]+)}.match(base)
raise ArgumentError, "cannot parse URI: #{base}" unless m

host = "http://" + m[1].strip
path = m.post_match
path = '/' if path.empty?
puts "主机名 #{host}"
puts "书籍所在网页路径 #{path}"
path = path[0..path.rindex('/') - 1]
puts "分析出下载书籍网页的本地路径 path #{path}"
webbookpath = base[0..base.rindex('/') - 1]
puts "发现下载书籍所在网页路径 webbookpath #{webbookpath}"

doc = if uselocalfiletodebug
        Hpricot.parse(File.read("list.html"))
      else
        Hpricot.parse(get(base, 'gbk'))
      end

(doc/"a").each do |link|
  href = link.attributes['href']
  next if href.nil? || href.index('.html').nil?

  # Chapter links look like "993170.html"; the numeric part orders chapters.
  chapter_number = href[0...href.rindex('.')].to_i
  # Strictly-less-than: the newest local chapter is fetched again.
  next if chapter_number != 0 && chapter_number < fnmax

  chapter_url = join_ref(webbookpath, href)
  book_content_path = join_ref(dir_book_path, href)

  puts "发现书籍章节相对链结数据 #{link}"
  FileUtils.makedirs(File.dirname(book_content_path))

  doc2 = if uselocalfiletodebug
           Hpricot.parse(File.read("993170.html"))
         else
           Hpricot.parse(get(chapter_url, 'gbk'))
         end

  (doc2/"img").each do |imagelink|
    src = imagelink.attributes['src']
    # Only GIF images are mirrored.
    next if src.nil? || src.index('.gif').nil?

    # NOTE(review): src is assumed to be a path on the same host; an absolute
    # URL in src would be joined onto host as-is -- confirm against the site.
    image_url = join_ref(host, src).strip
    book_image_path = join_ref(dir_book_path, src)

    puts "分析后图像最终下载链接 webbookcontentimagepath #{image_url}"
    FileUtils.makedirs(File.dirname(book_image_path))

    data = nil
    open_url(image_url) do |remote|
      # A differing final URL means we were redirected, i.e. the image is
      # not really at this address; leave any local copy untouched.
      if remote.base_uri.to_s == image_url
        if File.exist?(book_image_path)
          if File.size(book_image_path) != remote.meta['content-length'].to_i
            puts "文件网上被更新,2者不相等,需要重新下载"
            data = remote.read
          else
            puts "图像文件下载后网上没有被更新,不用重新下载"
          end
        else
          # No local copy yet: read from the handle that is already open
          # rather than requesting the URL a second time.
          data = remote.read
        end
      end
    end

    File.open(book_image_path, "wb") { |out| out.write(data) } if data
  end

  # Point every <img> at the local mirror, relative to the book folder.
  (doc2/"img").each do |imagelink|
    src = imagelink.attributes['src']
    next if src.nil?
    imagelink.attributes['src'] = join_ref(".", src)
  end

  # Block form closes the file even when writing fails.
  File.open(book_content_path, 'wb') { |out| out.puts doc2 }
end