# coding=utf-8
import re
import sys
import urllib
def mdcode( str ):
    """Re-encode a byte string as GBK, guessing its source encoding.

    The candidate encodings 'utf-8', 'gbk' and 'gb2312' are tried in that
    order; the first one that both decodes the input and lets the result
    be encoded as GBK wins.

    Args:
        str: Encoded byte string to convert. The name shadows the builtin
            but is kept so existing keyword callers keep working.

    Returns:
        The GBK-encoded byte string, or the literal 'unknown' when none of
        the candidate encodings works.
    """
    for c in ('utf-8', 'gbk', 'gb2312'):
        try:
            return str.decode(c).encode('gbk')
        except UnicodeError:
            # Wrong guess (decode failed) or the text cannot be represented
            # in GBK (encode failed): fall through to the next candidate.
            # Only Unicode errors are swallowed so that real bugs and
            # KeyboardInterrupt are no longer hidden.
            continue
    return 'unknown'
# Page to scrape: a song listing on google.cn/music.
url = 'http://www.google.cn/music/topiclisting?q=top100_north_south_line&cat=song'
filename = 'c:\\tmp\\url.txt'  # local copy of the downloaded page
wname = 'c:\\tmp\\out.txt'     # output file, one extracted link per line

# Marker that ends every raw match: a literal backslash followed by
# "x26resnum", exactly as it appears in the page source.
marker = '\\x26resnum'

# A line is of interest when it contains "下载" ("download"), then "window",
# then an http link, up to the marker. The page holds a single backslash;
# it is written as "\\" here because a literal backslash must be escaped
# inside a regular expression.
regx = r'下载.*window.*http.*\\x26resnum'
# Picks the link itself (from "http" to the end) out of a cleaned-up match.
reg = r'http.*'

# Download the page; close the connection even if the read fails.
response = urllib.urlopen(url)
try:
    html = response.read()
finally:
    response.close()

# Keep a copy of the page on disk, then read it back line by line.
with open(filename, 'w') as page_out:
    page_out.write(html)
with open(filename, 'r') as page_in:
    lines = page_in.readlines()

reobj = re.compile(regx)
reo = re.compile(reg)

# Find the download entries, convert them to GBK, undo the URL quoting and
# cut the trailing marker off each one.
result = []
for line in lines:
    for match in reobj.finditer(line):
        entry = urllib.unquote(mdcode(match.group()))
        result.append(entry[:-len(marker)])

# Extract the final link from every cleaned-up entry.
links = [match.group() for r in result for match in reo.finditer(r)]

# Write the links, one per line.
with open(wname, 'w') as links_out:
    for link in links:
        links_out.write(link + "\n")