#coding=utf-8
import os
import re
import time
import requests
import json
from pymongo import MongoClient
import traceback
import urlparse
import urllib
import urllib2
import hashlib
import chardet
import random
import xlwt
import sys
# Python 2 idiom: reload sys so that setdefaultencoding is reachable again,
# then make UTF-8 the process-wide default codec used for implicit
# str <-> unicode conversions. This affects every module in the process.
reload(sys)
sys.setdefaultencoding("utf-8")
# Export data
def export_mongo_data():
    """Export documents that lack the ``xxx`` field to ``name.xls``.

    Documents are read through the module-level ``db_client`` in batches of
    at most 2000. Every batch gets its own worksheet (``name`` for the first
    batch, then ``name2``, ``name3``, ...) with the columns id / url / name,
    where name is ``zzz + _id + ".html"``.

    Per document:
      * non-empty ``url`` and no error: one row is written and ``xxx`` is
        set to True;
      * non-empty ``url`` and any exception: the traceback is printed,
        ``xxx`` is set to False and the row slot is reused by the next
        document;
      * empty ``url``: ``zzz`` is set to False and no row is written.

    The loop stops as soon as a batch contains no document that has not
    already been handled in this run. The workbook is saved once at the end,
    and only if at least one worksheet was created.

    NOTE(review): the query, the success update and the failure updates
    address three different database/collection paths. If they are meant to
    be the same collection, the flags never reach the queried documents --
    confirm the intended targets.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    base_sheet_name = u'name'
    batch_no = 1
    sheet_count = 0
    # Ids already handled in this run. Documents with an empty url never get
    # the ``xxx`` field, so the query returns them again on every round;
    # without this guard the loop would never terminate.
    seen_ids = set()

    while True:
        cursor = db_client.xxxxxx.xxx.find({'xxx': {'$exists': False}}).limit(2000)
        # Materialise the batch once instead of issuing a separate count()
        # query, and drop everything handled in an earlier round.
        # Limitation: if 2000 or more never-flagged documents sort first,
        # the export stops before reaching the documents behind them.
        pending = [doc for doc in cursor if doc['_id'] not in seen_ids]
        if not pending:
            break

        if batch_no > 1:
            sheet_title = base_sheet_name + str(batch_no)
        else:
            sheet_title = base_sheet_name

        # cell_overwrite_ok: a document that fails after some of its cells
        # were written does not advance row_index, so the next document must
        # be allowed to overwrite that partially filled row.
        sheet = workbook.add_sheet(sheet_title, cell_overwrite_ok=True)
        sheet_count += 1
        sheet.write(0, 0, u'id')
        sheet.write(0, 1, u'url')
        sheet.write(0, 2, u'name')

        row_index = 1
        for doc in pending:
            seen_ids.add(doc['_id'])

            if not doc['url']:
                db_client.crawler_zuowen.gaosanW_byzhinengyuejuan.update(
                    {'_id': doc['_id']}, {'$set': {'zzz': False}})
                continue

            try:
                paper_name = doc['zzz'] + doc['_id'] + ".html"
                sheet.write(row_index, 0, doc['_id'])
                sheet.write(row_index, 1, doc['url'])
                sheet.write(row_index, 2, paper_name)
                db_client.xxx.xxx.update(
                    {'_id': doc['_id']}, {'$set': {'xxx': True}})
                print(doc['_id'])
            except Exception:
                # Best effort: report the problem, flag the document as
                # failed and carry on with the next one.
                traceback.print_exc()
                db_client.crawler_zuowen.gaosanW_byzhinengyuejuan.update(
                    {'_id': doc['_id']}, {'$set': {'xxx': False}})
                continue

            row_index += 1

        batch_no += 1

    # xlwt cannot save a workbook that has no worksheet.
    if sheet_count:
        workbook.save(u'name.xls')
if __name__ == '__main__':
    # export_mongo_data() reads the module-level name db_client, so it has
    # to be bound before the call.
    db_client = MongoClient('xxx.xxx.xx.xxxx',27017)
    try:
        export_mongo_data()
    finally:
        # Close the connection even when the export raises.
        db_client.close()