51job 简历:使用 CasperJS 抓取指定 URL 的简历页面并处理其 HTML
我们抓取到的简历页面 HTML 中含有大量多余内容(例如特殊标签),该如何处理呢?
var fs = require('fs');
var utils = require('utils');
// Runtime bootstrap.
// The script either runs standalone (no `casper` object exists yet, so one is
// created here) or is evaluated in an environment that already provides
// `casper` (the comment below mentions spooky/lib/bootstrap.js).
var standalone = false;
if (typeof(casper) === 'undefined') {
    standalone = true;
    var casper = require('casper').create();
}
// removing default options passed by the Python executable
casper.cli.drop("cli");
casper.cli.drop("casper-path");
var script = casper.cli.raw.args[0];

// Base directory used below to locate node_modules, 51job/common.js,
// the config file and the tmp/ download folder.
var thirdpath = '';
if (standalone) {
    thirdpath = fs.dirname(fs.dirname(fs.absolute(script)));
}
else {
    // it's spooky/lib/bootstrap.js if we invoke within spooky
    thirdpath = fs.dirname(fs.absolute(fs.dirname(fs.absolute(script)) + "/../../../../../cloud/daemon/thirdtools/"));
}

// load node underscore library - kind of hack because casper cannot use relative path to load js
// The working directory is switched for the duration of the require and
// restored right afterwards.
var scriptpath = fs.workingDirectory;
fs.changeWorkingDirectory(thirdpath + "/node_modules/underscore");
var us = require(thirdpath + "/node_modules/underscore");
fs.changeWorkingDirectory(scriptpath);

var config = JSON.parse(fs.read(thirdpath + "/../../config/config.json"));
var common = require(thirdpath + '/51job/common.js');

// Print usage when neither a pre-defined `options` object nor any CLI
// option is available; only a standalone run terminates here.
if ((typeof(options) === 'undefined') && Object.keys(casper.cli.options).length === 0) {
    casper.echo("Usage: --cookie=<cookie in string format>");
    if (standalone) casper.exit(1);
}
// don't use var options here as it will undefine pass-in options variable
options = (typeof(options) === 'undefined') ? casper.cli.options : options;

var cookie = options["cookie"];
var companyid = options["companyid"];
var resumecodefile = options["resumecode"];

// Shared state filled in while the resume list pages are walked.
var resumecode = [];        // lookup of already-known resume codes, indexed by resume id
var resumeid = [];          // ids of resumes that still have to be fetched
var resumeupdatetime = [];  // update dates scraped from the list pages
var allresumeids = [];      // every resume id seen on the list pages

// read resume code from file if file exists
// The file is consumed: it is deleted once its content has been parsed.
if (resumecodefile && fs.isFile(resumecodefile)) {
    resumecode = JSON.parse(fs.read(resumecodefile));
    fs.remove(resumecodefile);
}
/**
 * Walks the paginated resume list and then fetches each resume that is not
 * already known.
 *
 * For every list page it appends the page's resume ids to `allresumeids`,
 * queues ids missing from `resumecode` into `resumeid`, and appends the
 * date-looking tokens found in `td.inbox_td9` to `resumeupdatetime`.
 * While the pager's next button is enabled it clicks it and re-schedules
 * itself. On the last page it opens every queued resume, cleans up the HTML
 * and reports progress through `casper.echo` lines prefixed with
 * `resumedata:`, `percentage:`, `actionfinished:` or `actionfatal:`.
 *
 * Reads the file-level variables `domain`, `thirdpath`, `companyid`,
 * `standalone` and `resumecode`; takes no parameters and returns nothing.
 */
function fetchAllResumeInfo() {
    casper.waitForSelector('#pagerBottom_nextButton', function() {
        var nextbutton = this.getElementInfo('#pagerBottom_nextButton');
        // another page exists as long as the pager button is not disabled
        var hasnextpage = !!(nextbutton.attributes && !nextbutton.attributes.disabled);

        var resumeids = this.getElementsAttribute('#chkBox', 'value');
        if (resumeids && resumeids.length > 0) {
            allresumeids = allresumeids.concat(resumeids);
            //# only fetch resumes when the resumecode is not in our db
            resumeids.forEach(function(r) {
                if (!resumecode[r]) {
                    resumeid.push(r);
                }
            });
        }

        // Split the text of td.inbox_td9 on whitespace; the date tokens are
        // picked out below.
        // NOTE(review): getElementInfo describes only the first matching
        // element - confirm that a single td.inbox_td9 carries the dates of
        // every row, otherwise resumeupdatetime will not line up with
        // allresumeids.
        var resume_list_html = [];
        try {
            var td9 = this.getElementInfo('td.inbox_td9');
            if (td9) {
                resume_list_html = td9.text.split(/(\t|\s)/);
            }
        } catch (e) {
            // best effort: the update dates are optional, so only record the failure
            casper.log('unable to read td.inbox_td9: ' + e, 'debug');
        }

        // keep the first 10 characters of every token shaped like a date
        var datepattern = /^(\d+)-(\d{1,2})-(\d{1,2})$/;
        var resume_update_time = [];
        resume_list_html.forEach(function(a) {
            var candidate = a.substring(0, 10);
            if (datepattern.test(candidate)) {
                resume_update_time.push(candidate);
            }
        });
        resumeupdatetime = resumeupdatetime.concat(resume_update_time);

        if (hasnextpage) {
            this.click('#pagerBottom_nextButton');
            casper.then(function() {
                fetchAllResumeInfo();
            });
            return;
        }

        // Last page reached: fetch the queued resumes.
        var m = utils.unique(resumeid),
            n = 0,              // number of resumes fetched successfully
            limit = 10,         // size of the initial batch handed to startfetch
            mlength = m.length; // number of resumes still expected to succeed

        //# BAK
        /**
         * Opens one resume page, post-processes its HTML and echoes the result.
         * @param x resume id (hidSeqID) to open
         * @param callback called without arguments on success and with
         *        'timedout' when #divResume does not show up within 10 seconds
         */
        var fetchresume = function(x, callback) {
            casper.thenOpen('http://' + domain + '/Candidate/ResumeViewFolder.aspx?hidSeqID=' + x + '&hidFolder=BAK' ).then(function() {
                var current_url = this.getCurrentUrl();
                // login session is not valid anymore
                if (/(.*?)MainLogin.aspx/g.exec(current_url)) {
                    this.echo('actionfatal:登录Session被中断,导入未完成!');
                    if (standalone) this.exit(1);
                    // NOTE(review): callback is not invoked on this path, so when
                    // not standalone the final waitFor can only end by timing out
                    // - confirm this is intended.
                    return;
                }
                // seems good, fetch page info, only fetch resume info here
                casper.waitForSelector('#divResume', function() {
                    // strip tabs, the menu and stylesheet links from the live page
                    casper.evaluate(function() {
                        __utils__.removeElementsByXPath('//*[@class="rs_tab3"]');
                        __utils__.removeElementsByXPath('//*[@id="MenuContent"]');
                        __utils__.removeElementsByXPath('//*[@rel="stylesheet"]');
                    });
                    // rebuild a minimal document: <head> without script/style/title
                    // elements, plus the #divResume markup as the body
                    var head = this.getElementInfo('head').tag.replace(/\t|\n/g,'').replace(/<script[^>]*?>.*?<\/script>/g,'').replace(/<style[^>]*?>.*?<\/style>/g,'').replace(/<title[^>]*?>.*?<\/title>/g,'');
                    var resumehtml = "<!doctype html>\n<html lang=\"en\">\n";
                    resumehtml += head;
                    resumehtml += '<body>';
                    resumehtml += this.getElementInfo('#divResume').html.replace(/\t|\n/g,'');
                    resumehtml += '</body>';
                    resumehtml += "</html>\n";
                    //# hide 51job head&chart
                    if (/<div id=\"divHead\">/.test(resumehtml))
                        resumehtml = resumehtml.replace('<div id=\"divHead\">', '<div id=\"divHead\" style=\"display:none\">');
                    if (/<div id=\"divChart\">/.test(resumehtml))
                        resumehtml = resumehtml.replace('<div id=\"divChart\">', '<div id=\"divChart\" style=\"display:none\">');
                    //# fetch flag
                    // look up the update date by the position of this id in the list
                    var idx = allresumeids.indexOf(x);
                    if (resumeupdatetime[idx]) {
                        resumehtml = resumehtml + '<div id=\"__ppp_update_time\" style=\"display:none\">'+resumeupdatetime[idx]+'<\/div>';
                    }
                    resumehtml = resumehtml + '<div id=\"__ppp_resume_fetch__\" style=\"display:none\">';
                    var position_match = /应聘职位:<span class=\"blue\">(.*?)<\/span>/g.exec(resumehtml);
                    //# attachment
                    var attachment_re = /javascript:ShowAttach\((.*?),(.*?),(.*?)\)/g;
                    var attachment_match = attachment_re.exec(resumehtml);
                    var attachment_url = [];
                    var attachment_path = '';
                    var hideuserid = this.getElementsAttribute('#hidUserID','value');
                    if (attachment_match) {
                        // inline the candidate photo as a base64 data URI
                        var photodata = this.captureBase64("PNG", "td[valign=\"top\"] table td[valign=\"middle\"] a img");
                        if (photodata) {
                            //# the first one
                            resumehtml = resumehtml.replace("../Candidate/ReadAttach.aspx?UserID=" + hideuserid,"data:image/png;base64," + photodata);
                        }
                    }
                    // collect the download URL of every ShowAttach(...) link
                    while (attachment_match) {
                        if (attachment_match[1].length > 2 && attachment_match[2].length > 2) {
                            attachment_url.push('http://' + domain + '/Candidate/ReadAttach.aspx?AttachID='+attachment_match[2]+'&UserID='+ attachment_match[1]);
                        }
                        attachment_match = attachment_re.exec(resumehtml);
                    }
                    var destfolder = thirdpath + '/tmp/'+ companyid + "/" + x;
                    if (attachment_url.length > 0) {
                        fs.makeDirectory(thirdpath + '/tmp/');
                        fs.makeDirectory(thirdpath + '/tmp/' + companyid);
                        fs.makeDirectory(destfolder);
                        attachment_path = destfolder;
                        attachment_url.forEach(function(url) {
                            var attachment_subject = companyid + x + new Date().getTime() + '.png';
                            var attachment_filename = destfolder + "/" + attachment_subject;
                            casper.download(url, attachment_filename);
                            // point one remaining ../Candidate/ image at a blank placeholder
                            resumehtml = resumehtml.replace(/src=\"..\/Candidate\/(.*?)"/,'src="/images/blank.png"');
                        });
                    }
                    //make the photo can't click
                    resumehtml = resumehtml.replace(/<a href=\"javascript:ShowAttach(.*?)>/g,'');
                    var resultinfo = {};
                    resultinfo.resumeposition = position_match ? position_match[1] : '无职位(系统)';
                    resultinfo.resumename = this.getTitle();
                    resultinfo.resumehtml = resumehtml;
                    resultinfo.resumecode = x;
                    resultinfo.resumeattachment = attachment_path;
                    this.echo("resumedata:" + JSON.stringify(resultinfo));
                    if (callback) callback();
                }, function timedout() {
                    if (callback) callback('timedout');
                }, 10000);
            });
        };

        // Starts the next queued resume, if any is left.
        var fetchnext = function() {
            if (m.length <= 0) return;
            startfetch();
        };

        // Fetches the given ids; without an argument it takes one id off the queue.
        var startfetch = function(items) {
            // if items is not given, just start one
            if (!items) items = m.splice(0, 1);
            if (items.length > 0) {
                items.forEach(function(x) {
                    fetchresume(x, function(error) {
                        if (!error) {
                            n++;
                            casper.echo("percentage:" + (mlength>0 ? n*100/mlength : 100));
                        }
                        else {
                            // one failed, exclude it from final actionfinished
                            mlength--;
                        }
                        // any time there is one finished, we start a new one, so to maintain the max number of concurrent fetching to limit
                        fetchnext();
                    });
                });
            }
        };

        // at beginning, we launch max limit fetching
        startfetch(m.splice(0, limit));
        // wait for finished
        casper.waitFor(function check() {
            return n >= mlength;
        }, function done() {
            this.echo("actionfinished:" + n);
        }, function timedout() {}, 1800000);
    });
}
// Entry point: a cookie is mandatory. With a cookie, install it into
// PhantomJS, open the HR navigator page to check that the session is still
// accepted, and then start walking the resume list.
if (!cookie) {
    casper.echo("Usage: --cookie=<cookie in string format>");
    if (standalone) casper.exit(1);
}
else {
    // set once casper.start() has been issued; casper.run() must not be
    // called on an instance that was never started
    var started = false;
    try {
        phantom.cookies = JSON.parse(cookie);
        // `domain` is also read by fetchAllResumeInfo when it builds resume URLs
        var domain = common.getCookieDomain(phantom.cookies);
        // we need to try to see if given cookie is valid, otherwise we cannot continue
        casper.start('http://' + domain + '/Inbox/CompanyHRNavigator.aspx').then(function() {
            // get current url
            var current_url = this.getCurrentUrl();
            // login fail
            if (/(.*?)MainLogin.aspx/g.exec(current_url)) {
                this.echo('loginerr');
                if (standalone) this.exit(1);
                return;
            }
            // seems good, fetch resume page
            fetchAllResumeInfo();
        });
        started = true;
    }
    catch (e) {
        // login cookie not valid, need to re-login again
        casper.echo("loginerr");
        if (standalone) casper.exit(1);
    }
    // skip run() when start-up failed: "loginerr" has already been reported
    if (started) {
        casper.run();
    }
}