1 __author__ = 'minmin'
2 #coding:utf-8
3 import re,urllib,sgmllib
4
def getHtml(url):
    """Fetch ``url`` and return the response body.

    The body is returned exactly as read from the connection; no decoding
    or other processing is applied here.

    Errors raised by ``urllib.urlopen`` or ``read`` are not caught and
    propagate to the caller.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when read() fails part-way through.
        page.close()
11
# Matches <p ...>text</p> where the text contains no nested markup;
# group 1 is the paragraph text.
_PARAGRAPH_RE = re.compile(r"<p.*?>([^<>]*)</p>", re.M)

# Substrings that are replaced by a single space in every paragraph.
# The original also replaced "<strong>", "</strong>" and "<br>", but the
# pattern above only captures text free of '<' and '>', so those
# replacements could never match and have been dropped.
# NOTE(review): the original additionally replaced " " with " ", which is a
# no-op as written - possibly a mangled "&nbsp;". Confirm, and add the
# entity here if that was the intent.
_BLANKED_TOKENS = ("“", "”", "·")


def _extract_article(html):
    """Return the text of every non-empty markup-free paragraph in ``html``."""
    lines = []
    for text in _PARAGRAPH_RE.findall(html):
        if not text:
            continue
        for token in _BLANKED_TOKENS:
            text = text.replace(token, " ")
        lines.append(text + "\n")
    # join() instead of repeated "+" keeps the build linear in the output size.
    return "".join(lines)


def func(str):
    """Download the page at ``str`` and return its article text.

    ``str`` is the page URL. The parameter name shadows the builtin; it is
    kept so that existing callers are unaffected. The page is fetched from
    this argument rather than from a module-level ``url`` variable.

    Returns one line per non-empty paragraph, each terminated by a newline,
    or an empty string when the page has no matching paragraph.
    """
    page_url = str
    return _extract_article(getHtml(page_url))
28
29 #html链接的标签是“a”,链接的属性是“href”,也就是要获得html中所有tag=a,attrs=href 值。
30 class URLPaser(sgmllib.SGMLParser):
31 def reset(self):
32 sgmllib.SGMLParser.reset(self)
33 self.urls = []
34
35 def start_a(self,attrs):
36 href = [v for k,v in attrs if k == 'href']
37 if href:
38 self.urls.extend(href)
39
# Crawl the Sohu travel news index page and save the text of each linked
# 2015 article to its own numbered file.
link_parser = URLPaser()
index_page = urllib.urlopen("http://travel.sohu.com/lvyouxinwen.shtml")
try:
    # Parsing the page fills link_parser.urls with every href found.
    link_parser.feed(index_page.read())
finally:
    # Close the connection even if reading or parsing fails.
    index_page.close()

# Only links under the 2015 path are treated as articles. The dots are
# escaped so they match literally, and match() already anchors at the start
# of the string, so no trailing wildcard is needed.
article_url = re.compile(r'http://travel\.sohu\.com/2015')

saved_count = 0
seen_urls = set()  # article links already processed, for de-duplication
for url in link_parser.urls:
    if not article_url.match(url) or url in seen_urls:
        continue
    seen_urls.add(url)
    print(url)
    artical = func(url)
    print(artical)
    if not artical:
        # Nothing was extracted; do not consume a file number.
        continue
    saved_count += 1
    # NOTE: the sougou/Travel/ directory must already exist; open() does not
    # create it. The file is opened in append mode, so re-running the script
    # adds to files left behind by an earlier run.
    with open("sougou/Travel/" + str(saved_count) + '.txt', 'a+') as out:
        out.write(artical)