MP3下载:baidu top100
cjcse
|
1#
cjcse 发表于 2008-09-15 18:42
MP3下载:baidu top100
注:本人初学python,把自己尝试编写的代码贴出来供大家一起学习。高人如有改进办法,欢迎指正!
[Copy to clipboard] [ - ]
CODE:
#!/usr/bin/python
import urllib
import urlparse
import htmllib
import formatter
import string
import os
import sys
import thread
import threading

# Ranking page that lists the current top songs.
url1 = "http://list.mp3.baidu.com/list/newhits.html?top1"

# Directory the downloaded files are written to.
SAVE_DIR = "c:\\baidump3"

# Responses smaller than this are treated as "not a real mp3" and skipped.
MIN_MP3_BYTES = 1024 * 1024


class Parser(htmllib.HTMLParser):
    """Collect the hyperlinks of a page, grouped by anchor text.

    After feeding a document, ``anchors`` maps every non-empty anchor text
    to the list of href values that were seen with that text.
    """

    def __init__(self, verbose=0):
        self.anchors = {}
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter(), verbose)

    def anchor_bgn(self, href, name, type):
        # Start buffering the text between <a ...> and </a>.
        self.save_bgn()
        self.anchor = href

    def anchor_end(self):
        # A stray </a> without a matching <a> leaves no save buffer;
        # calling save_end() in that state would fail, so ignore the tag.
        if getattr(self, "savedata", None) is None:
            return
        text = self.save_end().strip()
        if self.anchor and text:
            self.anchors.setdefault(text, []).append(self.anchor)
        # Forget the href so it cannot leak into a later, unrelated </a>.
        self.anchor = None


def DownObjectByUrl(url):
    """Fetch *url* and return the complete response body as a string.

    Raises whatever ``urllib.urlopen`` / ``read`` raise on failure.
    """
    fp = urllib.urlopen(url)
    try:
        chunks = []
        while True:
            chunk = fp.read(8192)
            if not chunk:
                break
            chunks.append(chunk)
    finally:
        # Close the connection even when a read fails half-way.
        fp.close()
    return "".join(chunks)


def _UniquePath(path):
    """Return *path*, or *path* with a ``_<n>`` suffix before the extension
    when a file of that name already exists."""
    if not os.path.exists(path):
        return path
    root, ext = os.path.splitext(path)
    index = 1
    while True:
        candidate = "%s_%d%s" % (root, index, ext)
        if not os.path.exists(candidate):
            return candidate
        index += 1


# Download the network resource named by url into the directory path.
def SaveFile(url, path):
    """Download the mp3 at *url* into directory *path*.

    The file name is the last ``/``-separated component of *url*; an
    existing file is never overwritten (a numeric suffix is added instead).
    URLs that do not contain ``.mp3`` and responses smaller than
    ``MIN_MP3_BYTES`` are skipped silently.

    Returns 1 when the file was written, 0 otherwise. Never raises: a
    failed download is reported on stdout as ``<url>\\t[Failed]``.
    """
    name = os.path.join(path, url.split("/")[-1])
    url = UncodeUrl(url)
    if ".mp3" not in url:
        return 0
    try:
        content = DownObjectByUrl(url)
        if len(content) < MIN_MP3_BYTES:
            return 0
        if not os.path.isdir(path):
            try:
                os.mkdir(path)
            except OSError:
                # Another worker thread may have created it meanwhile.
                if not os.path.isdir(path):
                    raise
        name = _UniquePath(name)
        out = open(name, "wb")
        try:
            try:
                out.write(content)
            finally:
                out.close()
        except EnvironmentError:
            # Do not leave a truncated file behind.
            if os.path.exists(name):
                os.remove(name)
            raise
    except Exception:
        # Deliberate best effort: callers only look at the 0/1 result.
        print(url + "\t[Failed]")
        return 0
    print(url + "\t[OK]")
    return 1


def GetNextRankLinks(url):
    """Return every href found on the page at *url* as a flat list."""
    parser = Parser()
    parser.feed(DownObjectByUrl(url))
    parser.close()
    links = []
    for hrefs in parser.anchors.values():
        links.extend(hrefs)
    return links


# -------- Undo baidu's URL obfuscation: recover the real mp3 link from the
# -------- encoded URL embedded in the page ------------------------------

# State shared by the legacy entry points GetBaiduFLJ() / DisUrl() / A().
# The worker threads do NOT use it (see DownMp3), because several threads
# overwriting these names concurrently would mix up each other's URLs.
F = ""
L = ""
J = ""
M = 0

# K maps an alphabet position (0..61) to a character code,
# H maps a character code back to its alphabet position.
K = [0] * 127
H = [0] * 127


def N(S, P, Q):
    """Register alphabet positions S..P as character codes S+Q..P+Q."""
    for R in range(S, P + 1):
        K[R] = R + Q
        H[R + Q] = R


# The 62-character alphabet is "0-9", "A-Z", "a-z", in that order. The
# tables never change, so they are built once instead of on every decode.
N(0, 9, 48)
N(10, 35, 55)
N(36, 61, 61)


def _ShiftDecode(text, shift):
    """Move every ASCII letter/digit of *text* back by *shift* positions in
    the 62-character alphabet; all other characters are kept unchanged."""
    out = []
    for ch in text:
        if 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or '0' <= ch <= '9':
            pos = H[ord(ch)] - shift
            if pos < 0:
                pos += 62
            ch = chr(K[pos])
        out.append(ch)
    return "".join(out)


def _ShiftFromKey(key):
    """Turn the page's F value into the shift amount (1..25).

    Returns None when *key* is not an integer literal.
    """
    try:
        shift = int(key) % 26
    except ValueError:
        return None
    if not shift:
        shift = 1
    return shift


def A(Q):
    """Decode *Q* using the module-level shift M."""
    return _ShiftDecode(Q, M)


def DisUrl():
    """Decode the URL stored by the last successful GetBaiduFLJ() call.

    Updates the global M. Returns the decoded URL, or "" when the stored
    F value is not numeric.
    """
    global M
    shift = _ShiftFromKey(F)
    if shift is None:
        return ""
    M = shift
    # The original computed A(L) too but only used it when L == J, in
    # which case it equals A(J); the result is therefore always A(J).
    return A(J)


def _ParseFLJ(content):
    """Extract (F, L, J) from the page source *content*.

    F is the text between ``var F=`` and the next comma; L and J are the
    double-quoted values following ``var I=`` and ``J=``. Returns None when
    any of the three pieces is missing.
    """
    start = content.find("var F=")
    if start == -1:
        return None
    start += len("var F=")
    end = content.find(",", start)
    if end == -1:
        return None
    key = content[start:end]

    pos = content.find("var I=", start)
    if pos == -1:
        return None
    pos += len("var I=") + 1  # +1 skips the opening quote
    close = content.find("\"", pos)
    if close == -1:
        return None
    first = content[pos:close]

    # Search after the closing quote so that a "J=" occurring inside the
    # first value cannot be mistaken for the variable name.
    pos = content.find("J=", close)
    if pos == -1:
        return None
    pos += len("J=") + 1  # +1 skips the opening quote
    close = content.find("\"", pos)
    if close == -1:
        return None
    second = content[pos:close]
    return key, first, second


# Extract the F/L/J values needed to decode a baidu mp3 path.
def GetBaiduFLJ(content):
    """Parse *content* and store the result in the globals F, L and J.

    Returns 1 on success and 0 when the page does not contain all three
    values; on failure the globals are left untouched.
    """
    global F, L, J
    parsed = _ParseFLJ(content)
    if parsed is None:
        return 0
    F, L, J = parsed
    return 1


# URL escaping, mainly for the non-ASCII (Chinese) characters in a URL.
def UncodeUrl(url):
    """Percent-escape the non-ASCII bytes and the spaces of *url*.

    A byte >= 0x80 is escaped together with the byte that follows it, i.e.
    such bytes are treated as the first half of a two-byte character
    (presumably GBK -- TODO confirm against the site's encoding). Existing
    ``%`` escapes and all other ASCII characters are passed through.
    NOTE(review): assumes *url* is a byte string, as returned by urllib in
    Python 2.
    """
    out = []
    second_byte = False
    for ch in url:
        code = ord(ch)
        if second_byte:
            second_byte = False
            # %02x keeps the escape two digits wide even below 0x10.
            out.append("%%%02x" % code)
        elif code >= 0x80:
            out.append("%%%02x" % code)
            second_byte = True
        elif ch == ' ':
            out.append("%20")
        else:
            out.append(ch)
    return "".join(out)


def DownMp3(item):
    """Worker: download one song.

    *item* is the URL of the song's search-result page. Every link on that
    page is tried in turn until one of them yields a saved mp3. Decoding
    uses local values only, so concurrent workers cannot interfere.
    """
    try:
        candidates = GetNextRankLinks(item)
    except Exception:
        # Thread boundary: report and give up on this song only.
        print(item + "\t[Failed]")
        return
    for link in candidates:
        try:
            content = DownObjectByUrl(UncodeUrl(link))
        except Exception:
            # One unreachable candidate must not stop the remaining ones.
            continue
        if not content:
            continue
        parsed = _ParseFLJ(content)
        if parsed is None:
            continue
        key, _unused, encoded = parsed
        shift = _ShiftFromKey(key)
        if shift is None:
            continue
        mp3url = _ShiftDecode(encoded, shift)
        if not mp3url:
            continue
        if SaveFile(mp3url, SAVE_DIR) == 1:
            break


def main():
    """Start one download worker per song on the ranking page and wait
    until all of them have finished."""
    workers = []
    for item in GetNextRankLinks(url1):
        if "http://" in item and "word=" in item:
            print(item)
            worker = threading.Thread(target=DownMp3, args=(item,))
            # Daemon threads let Ctrl-C end the program immediately.
            worker.setDaemon(True)
            worker.start()
            workers.append(worker)
    for worker in workers:
        # join() with a timeout keeps the main thread interruptible and
        # replaces the former CPU-burning "while True: pass" loop.
        while worker.isAlive():
            worker.join(1.0)


if __name__ == "__main__":
    main()

# Special thanks: yugliu