爬取51job招聘信息(一)
目标:爬取 51job 搜索结果页面上的招聘信息,实现翻页,并将结果存储为 CSV 文件。
import csv
import os
import time
from concurrent.futures.thread import ThreadPoolExecutor
from json import loads
from multiprocessing import Queue
from re import findall
from threading import Thread

import pandas as pd
import pymysql
import requests
# Fetch the search results of a single page.
def get_one_page(page, city_code='000000'):
    """Fetch one page of 51job search results for the keyword "数据分析".

    Args:
        page: Page number inserted into the search URL.
        city_code: Code inserted into the first field of the URL path
            (presumably the job-area slot -- confirm against the site).
            The default '000000' is the value that was previously hard-coded.

    Returns:
        The value stored under ``engine_search_result`` in the JSON object
        embedded in the page, or ``None`` when the request does not return
        HTTP 200 or the page carries no embedded search result.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 90.0.4430.212 Safari / 537.36'
    }
    url = (
        f'https://search.51job.com/list/{city_code},000000,0000,00,9,99,数据分析,2,{page}.html'
        # "&degreefrom" had been mangled into "°reefrom" (the "&deg" prefix was
        # decoded as an HTML entity), which silently dropped that parameter.
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
        '&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    )
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 200:
        print('请求失败!')
        return None

    # The result list is embedded in the HTML as a JSON object assigned to
    # window.__SEARCH_RESULT__ inside a <script> tag.
    matches = findall(
        r'window\.__SEARCH_RESULT__\s*=\s*(\{.+?\})</script>',
        response.text,
    )
    if not matches:
        # e.g. an anti-crawler page or a changed layout: report it instead of
        # crashing with IndexError so the caller can stop paging cleanly.
        print('未找到搜索结果数据!')
        return None
    return loads(matches[0]).get('engine_search_result')
# Number of pages to fetch: pages 1 through 10, stopping early as soon as a
# page comes back empty.
start_page = 1
ts = []  # one entry per fetched page, each entry being that page's result list
while start_page <= 10:
    page_items = get_one_page(start_page)
    if not page_items:
        print('没有更多数据')
        break
    ts.append(page_items)
    start_page += 1
# data_1 = get_one_page(1)  # first attempt: keep the content of a single page
# Flatten the per-page result lists into one list holding every job record.
# Each page is iterated directly rather than indexed 0..49, so a page with
# fewer than 50 records (typically the last one) no longer raises IndexError.
data_1 = [record for page_items in ts for record in page_items]
# The fields to keep for each posting. Every row lists them in the same order
# as the column headers used when the table is exported.
jobs = [
    [
        entry.get('job_name'),
        entry.get('providesalary_text'),
        entry.get('company_name'),
        entry.get('companytype_text'),
        entry.get('workarea_text'),
        # attribute_text is a list of strings; collapse it into one
        # dash-separated field, falling back to five dashes when it is missing.
        '-'.join(entry.get('attribute_text', ['-', '-', '-', '-', '-'])),
        entry.get('jobwelf'),
    ]
    for entry in data_1
]
# Column headers for the exported table, in the same order as each row of `jobs`.
# NOTE(review): 'workarea_tex' looks like a typo for 'workarea_text'. It is kept
# unchanged because it is the header actually written to the CSV file.
name = [
    'job_name',
    'providesalary_text',
    'company_name',
    'companytype_text',
    'workarea_tex',
    'attribute_text',
    'jobwelf',
]
# `pd` is pandas; it must be imported at the top of the file as
# `import pandas as pd`, otherwise this line fails with NameError.
test = pd.DataFrame(columns=name, data=jobs)
test.to_csv("testcsv.csv")  # save as CSV (pandas also writes the row index by default)
test.info()  # print a summary of columns, non-null counts and dtypes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   job_name            500 non-null    object
 1   providesalary_text  500 non-null    object
 2   company_name        500 non-null    object
 3   companytype_text    500 non-null    object
 4   workarea_tex        500 non-null    object
 5   attribute_text      500 non-null    object
 6   jobwelf             500 non-null    object
dtypes: object(7)
memory usage: 27.5+ KB
重要参考:https://gitee.com/wenhaha8/job51_analysis