import requests, sqlite3, time
from os.path import exists
from os import remove,system
from bs4 import BeautifulSoup
from time import strftime
from threading import Thread
from datetime import datetime
def getmsg(url):
    """Scrape one listing page and return the movies it lists.

    Downloads ``url``, decodes the body as GBK, and walks every
    ``<table class="tbspan">`` inside ``<div class="co_content8">``.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list with one ``[movie_time, movie_name, movie_url]`` list per
        table, where ``movie_url`` is the second link's ``href`` prefixed
        with ``https://www.dytt8.net``.

    Raises:
        AttributeError: If the container div or a table's font tag is
            missing (``find`` returns ``None``).
        IndexError: If a table holds fewer than two ``<a>`` tags.
        Whatever ``requests.get`` raises on network failure.
    """
    page = requests.get(url)
    # Force GBK decoding instead of the encoding requests inferred.
    page.encoding = 'gbk'
    document = BeautifulSoup(page.text, 'html.parser')
    container = document.find('div', class_='co_content8')

    entries = []
    for table in container.findAll('table', class_='tbspan'):
        stamp_text = table.find('font', color='#8F8C89').get_text()
        # Characters 3..21 of the font text; presumably the timestamp that
        # follows a short label -- TODO confirm against the live markup.
        published = stamp_text[3:22]
        # Each table holds two <a> tags, so findAll is required (find would
        # return only the first); the second one is the one used here.
        anchors = table.findAll('a')
        title_anchor = anchors[1]
        title = title_anchor.get_text()
        detail_url = "https://www.dytt8.net" + title_anchor['href']
        entries.append([published, title, detail_url])
    return entries
def save_db(movie_time, movie_name, movie_url):
    """Insert one movie row into today's SQLite database.

    The database file name is built from the current local date, so the
    ``dytt_movies`` table must already exist in that file.

    Args:
        movie_time: Value stored in the first column.
        movie_name: Value stored in the second column.
        movie_url: Value stored in the third column.

    Raises:
        sqlite3.Error: If the insert fails (for example, the table is
            missing or the database stays locked).
    """
    dbname = '电影天堂_' + strftime('%y-%m-%d', time.localtime()) + '.sqlite'
    conn = sqlite3.connect(dbname)
    try:
        # Bound parameters instead of string interpolation: titles or URLs
        # containing a single quote no longer break (or inject into) the
        # statement. str() keeps the stringification the old '%s' did.
        conn.execute(
            "insert into dytt_movies values(?, ?, ?)",
            (str(movie_time), str(movie_name), str(movie_url)),
        )
        conn.commit()
    finally:
        # Always release the connection, even when the insert raises.
        conn.close()
def down_link_save(movie_time, movie_name, movie_url, timeout=30):
    """Fetch a movie's detail page and store its download link.

    Downloads ``movie_url``, decodes it as GBK, and takes the ``href`` of the
    first ``<a>`` inside the ``<td style="WORD-WRAP: break-word">`` cell of
    ``<div class="co_content8">``. That link, not the detail-page URL, is
    what gets passed to ``save_db``.

    Any failure (network error, timeout, unexpected markup, database error)
    is reported on stdout and swallowed, so one bad movie does not abort the
    caller.

    Args:
        movie_time: Passed through unchanged to ``save_db``.
        movie_name: Passed through to ``save_db``; also used in the failure
            message.
        movie_url: Address of the movie's detail page.
        timeout: Seconds to wait for the HTTP request. Without a timeout a
            stalled connection would block this call indefinitely.
    """
    try:
        detail_page = requests.get(movie_url, timeout=timeout)
        detail_page.encoding = 'gbk'
        detail_soup = BeautifulSoup(detail_page.text, 'html.parser')
        down_link = (
            detail_soup.find('div', class_='co_content8')
            .find('td', style="WORD-WRAP: break-word")
            .find('a')['href']
        )
        save_db(movie_time, movie_name, down_link)
    except Exception:
        # Best-effort by design, but a bare ``except:`` would also swallow
        # KeyboardInterrupt/SystemExit; catch ordinary errors only.
        print("{}获取链接失败".format(movie_name))
def show_results(dbname):
    """Dump all stored movies to a dated text file and hand it to the shell.

    Reads every row of ``dytt_movies`` ordered by ``movie_time`` descending,
    prints the row count, and writes one fixed-width line per row (running
    number, then the three columns) to a UTF-8 text file named after the
    current local date. Any existing file of that name is overwritten. The
    file is created even when there are no rows.

    Args:
        dbname: Path of the SQLite database to read.

    Raises:
        sqlite3.Error: If the query fails (for example, the table is missing).
    """
    filename = '电影天堂_' + strftime('%y-%m-%d', time.localtime()) + '.txt'

    conn = sqlite3.connect(dbname)
    try:
        results = conn.execute(
            "select * from dytt_movies order by movie_time desc "
        ).fetchall()
    finally:
        # Release the connection even if the query raises.
        conn.close()
    print("总共找到{}部电影!".format(len(results)))

    # Open once in write mode: this truncates a stale report and avoids
    # reopening the file for every single row.
    with open(filename, 'w', encoding='utf-8') as f:
        for i, movie in enumerate(results, start=1):
            f.write("{:<5s}{:<30s}{:<40s}{:<40s}\n".format(
                str(i), movie[0], movie[1], movie[2]))

    # Passes the file name to the OS shell as a command; presumably meant for
    # Windows, where that opens the file in its associated application.
    system(filename)
# Main routine / script entry point.
if __name__ == '__main__':
    start = datetime.now()
    dbname = '电影天堂_' + strftime('%y-%m-%d', time.localtime()) + '.sqlite'

    # Start every run from an empty database with a fresh table.
    if exists(dbname):
        remove(dbname)
    conn = sqlite3.connect(dbname)
    try:
        conn.execute("create table dytt_movies(movie_time varchar(40),movie_name varchar(40),movie_url varchar(60))")
        conn.commit()
    finally:
        conn.close()

    for i in range(1, 21):
        url = 'https://www.dytt8.net/html/gndy/china/list_4_{}.html'.format(i)
        print('collecting message from {:s}'.format(url))
        # getmsg downloads the page itself, so no separate request is made
        # here (the previous extra fetch was discarded unused).
        movies = getmsg(url)

        # One thread per movie on the page. Author's rationale: if one thread
        # cannot proceed, the others are unaffected, whereas a single thread
        # working through the whole page would stop at the first error.
        threads = [
            Thread(target=down_link_save, args=(item[0], item[1], item[2]))
            for item in movies
        ]
        for t in threads:
            t.start()
        # Finish the whole page before moving on to the next one.
        for t in threads:
            t.join()

    run_time = (datetime.now() - start).total_seconds()
    # The stray ``end=' '`` keyword was being passed to str.format, where it
    # was silently ignored; dropping it leaves the output unchanged.
    print("共用时{}秒".format(run_time))
    show_results(dbname)