1 __author__ = 'minmin'
2 #coding:utf-8
3 import re,urllib,sgmllib,os
4
5 #根据当前的url获取html
def getHtml(url):
    """Fetch ``url`` and return the body of the response.

    Args:
        url: Address handed unchanged to ``urllib.urlopen``.

    Returns:
        The data returned by ``read()`` on the opened resource.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when read() raises part-way through.
        page.close()
11
12 #根据html获取想要的文章内容
def func(str):
    """Download a page and return the text of its plain ``<p>`` paragraphs.

    Args:
        str: URL of the page to fetch through ``getHtml``. The name shadows
            the builtin; it is kept so existing callers keep working.

    Returns:
        The kept paragraphs concatenated into one string, each followed by a
        newline. Empty string when no paragraph qualifies.
    """
    # Fetch the page named by the argument rather than a module-level global.
    html = getHtml(str)

    # The captured group excludes '<' and '>', so only paragraphs without
    # nested markup are matched (and the text can never contain a tag).
    paragraphs = re.findall(r"<p>([^<>]*)</p>", html, re.M)

    kept = []
    for text in paragraphs:
        if not text:
            continue
        text = text.replace(" ", "")
        # Skip footer paragraphs carrying a copyright notice.
        if "Copyright" in text:
            continue
        kept.append(text + "\n")
    return "".join(kept)
25
26 #html链接的标签是“a”,链接的属性是“href”,也就是要获得html中所有tag=a,attrs=href 值。
27 class URLPaser(sgmllib.SGMLParser):
28 def reset(self):
29 sgmllib.SGMLParser.reset(self)
30 self.urls = []
31
32 def start_a(self,attrs):
33 href = [v for k,v in attrs if k == 'href']
34 if href:
35 self.urls.extend(href)
36
# Collect every link on the channel index page. getHtml closes the
# connection once the page has been read.
IParser = URLPaser()
IParser.feed(getHtml("http://tech.sina.com.cn/it/"))

# Only links under the IT channel are kept. The dots are escaped so they
# match literally instead of matching any character.
reg = r'http://tech\.sina\.com\.cn/it/.*'
pattern = re.compile(reg)

# Create the output directory on first run.
if not os.path.exists('news163_it'):
    os.makedirs('news163_it')

i = 0      # number of articles saved so far; also names the output file
url2 = []  # matching links already processed, to skip duplicates

# `url` stays a module-level name; helpers in this file may rely on it.
for url in IParser.urls:
    if not pattern.match(url) or url in url2:
        continue
    url2.append(url)

    artical = func(url)
    if not artical:
        # Nothing extractable on this page; do not consume a file number.
        continue

    print(url)
    print(artical)
    i = i + 1
    # 'a+' appends, so re-running the script adds to existing files.
    with open("news163_it/" + str(i) + '.txt', 'a+') as f:
        f.write(artical)