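// Parsing 51job resumes fetched with casperjs.
// casperjs fetches the resume at the given URL; the resulting HTML is parsed here to extract the matching fields.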
'use strict';
var cheerio = require('cheerio');
var us = require('underscore');
var stringsim = require("stringsim");
var resumeutil = require("resumeutil");
// Expose to the world
/**
 * Public entry point: parse a 51job resume.
 *
 * @param {string} subject  Subject line of the mail that carried the resume.
 * @param {string} html     Resume page HTML.
 * @param {function(Array)=} callback  Optional; handed to the parser, which
 *        uses it to report links to forwarded resumes.
 * @returns {Object|null} Parsed fields, or null when nothing was parsed.
 */
var parse51Job = module.exports = function(subject, html, callback) {
    // Do not go through `new Parser51Job(...)`: a constructor that returns a
    // non-object (parse() returns null for empty/forwarded input) is ignored by
    // `new`, so callers would receive an empty parser instance instead of null.
    var parser = Object.create(Parser51Job.prototype);
    return parser.parse(subject, html, callback);
};
/**
 * Parser constructor. Runs the parse immediately and returns its result, so
 * `new Parser51Job(...)` evaluates to the parsed data object when parse()
 * yields an object (a non-object result is ignored by `new`).
 */
function Parser51Job(subject, html, callback) {
    var outcome = this.parse(subject, html, callback);
    return outcome;
}
/**
 * Run the full pipeline: parse the HTML, enrich the result from the mail
 * subject, then normalise it.
 *
 * @returns {Object|null} null when `html` is empty; otherwise whatever
 *          sanityCheck() returns for the parsed data.
 */
Parser51Job.prototype.parse = function(subject, html, callback) {
    if (html) {
        var parsed = this.parse_v3(html, callback);
        this.parseSubject(parsed, subject);
        return this.sanityCheck(parsed);
    }
    return null;
};
/**
 * Normalise selected fields of the parsed data in place before it is returned.
 * Falsy `data` is passed through untouched.
 */
Parser51Job.prototype.sanityCheck = function(data) {
    if (data) {
        // [field, resumeutil method]; the `exp` step turns e.g.
        // 三年以上工作经验 into 3年以上工作经验
        var normalizers = [
            ['mobile', 'convertUnicodeNumber'],
            ['email', 'getEmail'],
            ['exp', 'convertChineseNumber']
        ];
        for (var i = 0; i < normalizers.length; i++) {
            var field = normalizers[i][0];
            if (data[field])
                data[field] = resumeutil[normalizers[i][1]](data[field]);
        }
    }
    return data;
};
Parser51Job.prototype.parseSubject = function(data, subject) {
if(!data)
return;
data.job_title = null;
if(subject) {
// (51job.com)申请贵公司安卓开发工程师(南京)-王茜(wang xi)
// (51job)转发简历 高级php开发工程师(南京-建邺区)
if(/\(.*\)转发简历/.test(subject)) {
subject = subject.replace(/\t|\n/g,'').replace(/^(.*)转发简历/, '').replace(/\)/, '');
var list = subject.split(/\(/);
if(list.length > 0){
data.job_title = list[0] ? list[0].trim() : null;
var location = list[1] ? list[1].split('-') : [];
data.jobcity = location[0] ? location[0].trim() : '';
data.jobcitydist = location[1] ? location[1].trim() : '';
}
}else if (/\(.*\)申请贵公司/.test(subject)){
subject = subject.replace(/\t|\n/g,'').replace(/^(.*)申请贵公司/, '');
var list = subject.split(/-/);
if(list.length > 1)
data.name = list.pop().trim();
var title = list.join('-');
var m = null;
if(m = title.match(/(.*)((.*?))$/)) {
data.job_title = m[1].trim();
var location = m[2].split('-');
data.jobcity = location[0].trim();
data.jobcitydist = location[1] ? location[1].trim() : '';
}
else
data.job_title = title.trim();
}
}
}
// v3 is third version of parsing
Parser51Job.prototype.parse_v3 = function(html, callback) {
var self = this;
var m, parse_data = {};
var html = html.replace(/\t|\n/g,'').replace(/ /g, ' ').replace(/\s+/g, ' ').replace(/:/g, ':').replace(/= ("|')/g, "=$1");
var $ = cheerio.load(html, {decodeEntities: true});
parse_data.update_time = $('#__ppp_update_time').text().trim();
// we use relative-to-key-word parsing method to find relative information
parse_data.source = '51job';
// before we processing the html, let's see whether it contains a link to open up the resumes. Only do it when callback is given for overwrite
if(callback) {
var hrefs = [];
var links = $('a').filter(function(v) {
if($(this).attr('href') && $(this).attr('href').match(/\/Candidate\/ResumeViewSingle.aspx/)) {
hrefs.push($(this).attr('href'));
return true;
}
return false;
});
if(hrefs.length > 0) {
// this is forwarding resume html, we need to open up the link, and save the content to overwrite resumefile
callback(hrefs);
return null;
}
}
// 应聘职位
var found = $('*:contains("应聘职位:")').last();
if(found.length > 0) {
var located = false;
if(found.text().trim() == '应聘职位:') {
while(found.length > 0) {
var s = found.text().trim();
if(s != '应聘职位:') {
// only need to split ': '. the space should be followed by :
var values = us.compact(s.replace(/(.*)应聘职位:/, '应聘职位:').split(/:(?=\s*)/));
if(values.length <= 1) {
found = found.parent();
continue;
}
if(values.length >2) {
//we have some format such as: m: value n: value (resume17.html)
var n = values[1].trim().split(/\s+/);
if(n <=1)
parse_data.job_title2 = n.shift();
else {
n.pop();
parse_data.job_title2 = n.join(' ');
}
}
else
parse_data.job_title2 = values[1].trim();
break;
}
else {
found = found.parent();
}
}
}
else {
found.contents().each(function() {
var s = $(this).text().trim();
if(s == '应聘职位:') {
located = true;
}
else if(s.length > 0 && located) {
parse_data.job_title2 = s;
return false;
}
});
}
}
// 应聘公司:
found = $('*:contains("应聘公司:")').last();
if(found.length > 0) {
var located = false;
if(found.text().trim() == '应聘公司:') {
while(found.length > 0) {
var s = found.text().trim();
if(s != '应聘公司:') {
var values = us.compact(s.replace(/(.*)应聘公司:/, '应聘公司:').split(/:(?=\s*)/));
if(values.length <= 1) {
found = found.parent();
continue;
}
if(values.length >2) {
//we have some format such as: m: value n: value (resume17.html)
var n = values[1].trim().split(/\s+/);
if(n <=1)
parse_data.company_name = n.shift();
else {
n.pop();
parse_data.company_name = n.join(' ');
}
}
else
parse_data.company_name = values[1].trim();
break;
}
else {
found = found.parent();
}
}
}
else {
found.contents().each(function() {
var s = $(this).text().trim();
if(s == '应聘公司:') {
located = true;
}
else if(s.length > 0 && located) {
parse_data.company_name = s;
return false;
}
});
}
}
// 投递时间:
if(!parse_data.update_time) {
found = $('*:contains("投递时间:")').last();
if(found.length > 0) {
var located = false;
if(found.text().trim() == '投递时间:') {
while(found.length > 0) {
var s = found.text().trim();
if(s != '投递时间:') {
var values = us.compact(s.replace(/(.*)投递时间:/, '投递时间:').split(/:(?=\s*)/));
if(values.length <= 1) {
found = found.parent();
continue;
}
if(values.length >2) {
//we have some format such as: m: value n: value (resume17.html)
var n = values[1].trim().split(/\s+/);
if(n <=1)
parse_data.update_time = n.shift();
else {
n.pop();
parse_data.update_time = n.join(' ');
}
}
else
parse_data.update_time = values[1].trim();
break;
}
else {
found = found.parent();
}
}
}
else {
found.contents().each(function() {
var s = $(this).text().trim();
if(s == '投递时间:') {
located = true;
}
else if(s.length > 0 && located) {
parse_data.update_time = s;
return false;
}
});
}
}
}
// 工作地点:
// 高级项目经理(规划设计分院)(深圳-南山区)
// 初级软件测试工程师(南京-建邺区)
if(parse_data.job_title2 && parse_data.job_title2.length > 0) {
// use not include !(\(|\)|)|()
if(m = parse_data.job_title2.match(/[\((](((?!(\(|\)|)|()).)*)[)\)]$/)) {
var location = m[1].split('-');
parse_data.jobcity = location.shift().trim();
parse_data.jobcitydist = location.length > 0 ? location.shift().trim() : '';
parse_data.job_title2 = parse_data.job_title2.replace(m[0], '').trim();
}
}
// pre-fill some information
parse_data.highest_degree_level = '';
parse_data.highest_major_name = '';
parse_data.highest_school_name = '';
parse_data.latest_company_name = '';
parse_data.latest_job_spec = '';
parse_data.latest_job_title = '';
parse_data.latest_exp = '';
// basic information
var located = false;
found = $('*').filter(function() {
var elem = this[0];
if(!elem || elem.type != 'tag') return false;
// check first posibility
if($(this).text().trim().match(/\|\s+(男|女|Male|Female)\s+/ig)) {
located = true;
return true;
}
// use first match to avoid reading user's input (resume18.html)
if(located)
return;
return $(this).text().trim().match(/^(男|女|Male|Female)\s+/ig);
}).last();
// found could be:
// 应届毕业生 | 男 | 21岁(1992年6月23日) | 未婚 | 175cm| 团员
// 一年以上工作经验 | 女 | 23岁(1990年6月22日) | 未婚 | 170cm| 群众
// 在读学生 | 女 | 15岁(1999年2月6日)
// 二年以上工作经验 | 男 | 24岁(1989年10月2日) | 未婚 | 175cm| 中共党员 | 身份证: 320626196607065213
// 男 | 30岁(1983年6月17日)
// > 10 years| Male | 34Years old(1979/3/24)
if(found.length > 0) {
var foundtext = found.text().trim();
var values = foundtext.split('|');
if(values.length <= 1)
values = values.shift().split(/\s+/);
else {
// add | before first item to split other tags when there is not work status
found.prepend('<span> | </span>');
foundtext = found.text().trim();
values = foundtext.split('|');
}
values = us.compact(values);
if(values.length > 0)
parse_data.exp = values.shift().trim();
// get rest
while(values.length > 0) {
var v = values.shift().trim();
if(m = v.match(/(男|女|Female|Male)/ig)) {
var sex = m[0].toLowerCase();
parse_data.sex = sex == 'female'? '女' : (sex == 'male' ? '男': sex);
}
if(m = v.match(/^(\d+)(岁|Years old)/i)) {
parse_data.age = m[1];
}
// two formats:
// 1966 年7月生
// 1966年7月6日
if(m = v.match(/\d+\s*年\d+月(\d+日|生)/g)) {
parse_data.birth = m[0].replace(/\s/, '').replace(/年|月/g,'-').replace(/日|生/g,'').replace(/\b(\w)\b/g, '0$1');
if(/\-$/.test(parse_data.birth))
parse_data.birth += '01';
continue;
}
if(m = v.match(/\d+\s*\/\d+\/\d+/g)) {
parse_data.birth = m[0].replace(/\s/, '').replace(/\//g,'-').replace(/\b(\w)\b/g, '0$1');
continue;
}
if(v.match(/婚$/g) || v.match(/单身/g) || v.match(/离异/g)) {
parse_data.marriage = v;
continue;
}
if(m = v.match(/^(\d+)cm/)) {
parse_data.height = m[1];
break;
}
}
if(values.length > 0) {
parse_data.socialparty = values.shift().trim();
}
// get name. The name is placed before basic information
// 流程状态:xxx 标签: 在读学生 | 女 | 22岁(1991年8月9日)(ID:309207915)居住地:南京-雨花台区户 口:扬州电 话:15295753639(手机)E-mail:15295753639@163.com
var p = found;
while(1) {
p = p.parent();
if(p.length <= 0)
break;
var text = p.text().trim();
var pos = text.indexOf(foundtext);
if(pos > 0) {
text = text.replace(/^流程状态:/g, '').replace(/^已转发/g, '');
pos = text.indexOf(foundtext);
var fronttext = text.substring(0, pos).trim();
var lasttext = text.substring(pos, text.length).trim();
// the name is ahead of foundtext, but we need to figure out (resume17.html)
// 1. if it's english(a-z), keep to reading back till we meet non-english; space is allowed
// 2. if it's chinese, till read we have at least three chinese chars; space is allowed
var values = us.compact(fronttext.split(/\s+(?![a-z]+))/));
var i = 0, onlyenglish=false;
while(1) {
if(i>= values.length)
break;
// remove special char or only numbers
if(/(:|^\d+$)/.test(values[values.length-1-i])) {
values.splice(values.length-1-i, 1);
continue;
}
else if(/^[a-z][a-z0-9]*$/i.test(values[values.length-1-i])) {
onlyenglish = true;
}
else {
onlyenglish = false;
}
i++;
// the name should not be up to 3 levels
if((i >=3 && !onlyenglish) || (onlyenglish && i>=5))
break;
}
// if only english, we need to combind together
if(onlyenglish)
parse_data.name = values.join(' ').trim();
else if (values.length > 2)
parse_data.name = values.pop();
else
parse_data.name = values.shift();
// remove some special chars -- or ―
if(parse_data.name)
parse_data.name = parse_data.name.replace(/[\-―](.*)/, '');
// look for ID directly
if(m = lasttext.match(/\(ID:(\d+)\)/)) {
parse_data.resumecode = m[1];
}
break;
}
}
// get next basic information
// layout:
// 居住地:北京-大兴区
// <td>居住地:</td>
// <td>xxx</td>
// <td>户 口:</td>
// <td>xxx</td>
// 电 话:<b>15510491669</b>(手机)
var found = $('*:contains("居住地:")', p).last();
if(found.length > 0) {
var location = found.text().trim();
if(location == "居住地:")
location = found.next().text();
location = location.replace(/居住地:/, '').split('-');
parse_data.city = location.shift().trim();
parse_data.citydist = location.length > 0 ? location.shift().trim() : '';
}
var found = $('*:contains("户 口:")', p).last();
if(found.length > 0) {
var residence = found.text().trim();
if(residence == "户 口:")
residence = found.next().text().trim();
parse_data.residence = residence.replace(/户 口:/, '');
}
var found = $('*:contains("电 话:")', p).last();
if(found.length > 0) {
var mobile = found.text().trim();
if(mobile == "电 话:")
mobile = found.next().text().trim();
parse_data.mobile = mobile.replace(/电 话:/, '').replace(/(手机)/, '').trim();
}
// english telephone number:
var found = $('*:contains("Telephone number:")', p).last();
if(found.length > 0) {
var mobile = found.text().trim();
if(mobile == "Telephone number:")
mobile = found.next().text().trim();
parse_data.mobile = mobile.replace(/Telephone number:/, '').replace(/\(MobilePhone\)/, '').trim();
}
var found = $('*:contains("E-mail:")', p).last();
if(found.length > 0) {
var email = found.text().trim();
if(email == "E-mail:")
email = found.next().text().trim();
parse_data.email = email.replace(/E-mail:/, '').trim();
}
// national ID
var found = $('*:contains("身份证:")', p).last();
if(found.length > 0) {
var nationid = found.text().trim();
if(nationid == "身份证:")
nationid = found.next().text().trim();
parse_data.nationid = nationid.replace(/身份证:/, '').trim();
}
}
else {
var first = 1;
found = $('*').filter(function() {
var elem = this[0];
if(!elem || elem.type != 'tag') return false;
// we only need to obtain first basic information resume11.html has multiple basic information provided by user custom input
if(first && $(this).text().trim().match(/^(基本信息|Basic Info.)$/g)) {
first = 0;
return true;
}
return false;
}).last();
// possible layout:
// <td style="width:width:14%" class="weight100"> 姓 名: </td>
// <td style="width:190px" class="weight190"> xxx </td>
// <td style="width:width:14%" class="weight100"> 性 别: </td>
// <td style="width:190px" class="weight190"> 女 </td>
// <td rowspan="7" style="width:width:14%" class="weight100" valign="top">
// <td> 出生日期: </td> <td> 1990年7月30日 </td>
// <td> 居 地: </td> <td> 南京 </td> </tr>
// <td> 工作年限: </td> <td> 一年以上 </td>
// <td> 电子邮件: </td> <td> layla1990@qq.com </td>
// <td> 学 历: </td> <td> 本科 </td>
// <td> 专 业: </td> <td> 计算机科学与技术 </td>
// <td> 职 能: </td> <td> 活动策划 </td>
// <td> 行 业: </td> <td> 互联网/电子商务 </td>
// <td> 手机号码: </td> <td colspan="3"> 13951087924 </td>
// <td> 公司电话: </td> <td colspan="3"> 086- 029- 88371041 </td>
// <td> 关 键 词: </td> <td colspan="3"> 建筑设计 城市规划 机场总图 土地开发 </td>
if(found.length > 0) {
var keys = [['name', '姓 名:'],
['name', 'Name:'],
['sex', '性 别:'],
['sex', 'Gender:'],
['birth', '出生日期:'],
['birth', 'Date of Birth:'],
['location', '居 住 地:'],
['location', 'Residency:'],
['exp', '工作年限:'],
['exp', 'Yrs.of Experience::'],
['email', '电子邮件:'],
['email', 'Email:'],
['mobile', '手机号码:'],
['mobile', 'Mobile Phone:'],
['companyphone', '公司电话:'],
['highest_degree_level', '学 历:'],
['highest_degree_level', 'Degree:'],
['highest_major_name', '专 业:'],
['highest_major_name', 'Major:'],
['latest_job_pos', '职 能:'],
['latest_job_pos', 'Job Category:'],
['latest_job_spec', '行 业:'],
['latest_job_spec', 'Industry:'],
['keyword', '关 键 词:']];
var p = found.next();
keys.forEach(function(o) {
var k = o.shift(), v=o.shift();
var found = $('*:contains("'+v+'")', p).last();
if(found.length > 0) {
if(k == 'location') {
var location = found.next().text().split('-');
parse_data.city = location.shift().trim();
parse_data.citydist = location.length > 0 ? location.shift().trim() : '';
return;
}
else if(k == 'birth') {
parse_data.birth = found.next().text().trim().replace(/\//g, '-').replace(/年|月/g,'-').replace(/日/g,'').replace(/\b(\w)\b/g, '0$1');
return;
}
else
parse_data[k] = found.next().text().trim();
}
})
}
}
// national ID. This can be anywhere, so we have to search globally
var found = $('*:contains("身份证:")').last();
if(found.length > 0) {
var nationid = found.text().trim();
nationid = nationid.replace(/(.*)身份证:/, '').trim().match(/^\d+/);
if(nationid.length > 0)
parse_data.nationid = nationid.shift();
}
// latest_job_spec/highest degree level might be in 最近工作 and 最高学历 sections
if(!parse_data.latest_job_spec || !parse_data.highest_degree_level) {
// 最近工作
var found = $('*').filter(function() {
var elem = this[0];
if(!elem || elem.type != 'tag') return false;
return $(this).text().trim().match(/^最近工作\s*\[/g);
}).last();
// format:
// <table><tr><td><b>最近工作</b></span><span><b>[ 2个月]</b></span></td></tr>
// <tr><td width="59">公 司:</td><td width="230">南京智风多媒体有限公司</td></tr>
// <tr><td>行 业:</td><td>计算机服务(系统、数据服务、维修)</td></tr>
// <tr><td>职 位:</td><td>软件测试实习生</td></tr></tbody>
// </table>
if(found.length > 0) {
var p = found;
while(1) {
var section = $('*:contains("公 司:")', p).last();
if(section.length > 0) {
// found the root node
parse_data.latest_company_name = section.next().text().trim();
break;
}
p = p.parent();
if(p.length <= 0)
break;
}
if(parse_data.latest_company_name) {
section = $('*:contains("行 业:")', p).last();
if(section.length > 0)
parse_data.latest_job_spec = section.next().text().trim();
section = $('*:contains("职 位:")', p).last();
if(section.length > 0)
parse_data.latest_job_title = section.next().text().trim();
}
}
// 最高学历
var found = $('*').filter(function() {
var elem = this[0];
if(!elem || elem.type != 'tag') return false;
return $(this).text().trim().match(/^最高学历$/g);
}).last();
// format:
// <td><tbody><tr><td colspan="2"><span class="font14 blue"><b>最高学历</b></span></td></tr>
// <tr><td width="59">学 历:</td><td width="230">大专</td></tr>
// <tr><td>专 业:</td><td>模具设计与制造</td></tr>
// <tr><td height="22">学 校:</td><td>昆山登云科技职业学院</td></tr>
// </tbody></table></td>
if(found.length > 0) {
var p = found;
while(1) {
var section = $('*:contains("学 历:")', p).last();
if(section.length > 0) {
// found the root node
parse_data.highest_degree_level = section.next().text().trim();
break;
}
p = p.parent();
if(p.length <= 0)
break;
}
if(parse_data.highest_degree_level) {
section = $('*:contains("专 业:")', p).last();
if(section.length > 0)
parse_data.highest_major_name = section.next().text().trim();
section = $('*:contains("学 校:")', p).last();
if(section.length > 0)
parse_data.highest_school_name = section.next().text().trim();
}
}
}
this.getWorkexp_v3($, parse_data);
this.getEducation_v3($, parse_data);
return parse_data;
}
/**
 * Locate the work-experience section (工作经验 / 职业经历 / 工作经历), rebuild one
 * '|'-delimited text line per experience, parse each with parseWorkexp_v3()
 * and store the results in `parse_data.workexp`; finally updateWorkexp() is
 * called. When no section heading is found the function returns without
 * touching `parse_data`.
 *
 * Typical lines:
 *   2013/02 -- 2013/09:江苏联盛科技有限公司 | 技术部 | java软件工程师 | IT服务(系统/数据/维护)/多领域经营 | 民营 | 规模:20-99人 | 2001-4000元/月 | asldkfj
 *   2014.01 - 至今 333 销售代表 互联网/电子商务 | 企业性质:外商独资 工作描述: 333
 *
 * @param {Function} $          cheerio instance loaded with the resume page.
 * @param {Object}   parse_data result object, modified in place.
 */
Parser51Job.prototype.getWorkexp_v3 = function($, parse_data) {
    var self = this;
    // found2 remembers a heading that is a bare content node of some element,
    // used when no element consists of the heading alone
    var found2 = [];
    var found = $('*').filter(function() {
        var elem = this[0];
        if(!elem || elem.type != 'tag') return false;
        if($(this).text().trim().match(/^(工作经验|职业经历|工作经历)[:]?$/g))
            return true;
        // otherwise search contents
        var p = $(this).contents().filter(function() {
            if($(this).text().trim().match(/^(工作经验|职业经历|工作经历)[:]?$/g))
                return true;
        }).last();
        // use sub found2
        if(p.length > 0)
            found2 = p;
    }).last();
    if(found.length <= 0) {
        if(found2.length <= 0)
            return;
        found = found2;
    }
    var sectiontext = found.text().trim();
    var horizonlayout = false, topend = false;
    var founds = [];
    // find the node that holds the experiences: normally the next non-empty
    // sibling of the heading (or of one of its ancestors)
    var p = found;
    while(1) {
        if(p.next().length <= 0) {
            if(p.parent().length <= 0)
                break;
            p = p.parent();
            continue;
        }
        var n = p.next();
        if(n && n.text().trim().length > 0) {
            p = n;
            break;
        }
        // some layout has such format
        // <html>
        //   <body>
        //     <table>...工作经历..</table>
        //     <table>...work exp 1</table>
        //     <table>...work exp 2</table>
        //     ...
        //   </body>
        // </html>
        //
        // so we check if next item starting with date range
        var insection = null;
        founds = n.nextAll().filter(function() {
            var v = $(this).text().trim();
            // FIXME: somehow the pure text field won't be selected, so we have to select it specifically
            var n = (this[0].next && this[0].next.type == 'text') ? this[0].next.data.trim() : '';
            if((v.length <= 0 && n.length <= 0) || insection === false)
                return false;
            v = v.replace(/(\d+)年(\d+)月/g, '$1/$2');
            if(v.match(/^\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)/g)) {
                insection = true;
                return true;
            }
            // first sibling that is not an experience ends the section
            insection = false;
            return false;
        });
        if(founds.length > 0) {
            p = n;
            horizonlayout = true;
            break;
        }
        // look for next parent
        p = p.parent();
        // the workexp may be included in the parent itself. When workexp is deep
        // under text mode, previous nextAll might not get it:
        // <div align="left"><b>职业经历</b>
        // <b> </b>
        // <br>
        // 2011.09-至今:西部机场集团规划发展部,机场规划与土地管理主管
        // </div>
        // <div align="left">2008.9-2011.09:深圳机场集团扩建工程指挥部,规划设计部,规划设计经理</div>
        //
        var text = p.text().trim();
        var regexp = new RegExp("^" + sectiontext);
        if(text.match(regexp)) {
            topend = true;
            // we reach the top of section, has to stop here
            break;
        }
    }
    // search multiple lines format:
    // 2013/07 -- 2013/10:<company name>|
    // 2013/07 -- 2013/10 ...
    // some other horizontal layout:
    // horizon layout is not coded as section, but as such:
    // <workexp head 1>
    // <basic inform 1>
    // <basic inform 11>
    // <workexp job text>
    // <workexp head 2>
    // <basic inform 2>
    // <basic inform 22>
    // <workexp job text>
    //
    // FIXME: need to parse uncommon format
    if(!horizonlayout) {
        if(topend) {
            // p itself holds heading + experiences: strip the heading from it
            var regexp = new RegExp("^" + sectiontext);
            var text = p.text().replace(regexp, '').trim();
            if(text.match(/\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)[:]$/g) ||
                text.match(/\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)\s+\S+/g)) {
                var html = p.html().replace(sectiontext, '');
                p.empty().html(html);
            }
        }
        founds = $('*', p).filter(function() {
            var elem = this[0];
            if(!elem || elem.type != 'tag') return false;
            // text might be from a parent element which contains multiple experience lines,
            // but we only need to keep the single element of having date range
            if($(this).children().length > 0) return false;
            var text = $(this).text().trim().replace(/(\d+)年(\d+)月/g, '$1/$2');
            // horizonlayout may be switched on by an earlier element of this very
            // filter pass; from then on only direct children of p are accepted
            if(text.match(/\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)[:]$/g)) {
                if(horizonlayout && !$(this).parent().is(p))
                    return false;
                if(!horizonlayout) {
                    if($(this).parent().is(p)) horizonlayout = true;
                }
                return true;
            }
            // same as above but with company together
            if(text.match(/\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)\s+\S+/g)) {
                if(horizonlayout && !$(this).parent().is(p))
                    return false;
                if(!horizonlayout) {
                    if($(this).parent().is(p)) horizonlayout = true;
                }
                return true;
            }
        });
    }
    // get multiple full line of work experience
    // founds is only the list of single text nodes. We need to track back to the
    // parent node which contains the full experience line text
    var workexps = [];
    if(horizonlayout) {
        // when section is in horizontal format, we can easily combine texts
        founds.each(function(i, el) {
            var c = $(this),
                n = null,
                m = null,
                pattern = null;
            c.find('br').each(function() {
                $(this).replaceWith('<span> | </span>');
            });
            // this is the node we need to obtain all text from next all elements
            // c is full complete text in format as:
            // <tr><td>2013 /12--2014 /2:南京智风多媒体有限公司(50-150人)<span style="color:#676767;"><b> [ 2个月] </b></span></td></tr>
            // <tr><td width="22%" class="text_left">所属行业:</td><td width="78%" class="text">计算机服务(系统、数据服务、维修)</td></tr>
            // <tr><td class="text_left"><b>软件测试</b></td><td class="text"><b>软件测试实习生</b></td></tr>
            // <tr><td colspan="2" class="text_left">主要是测试教育软件,...</td></tr>
            // <tr><td> 2008 /3--2012 /12:天泽信息产业股份有限公司(150-500人) [ 4年9个月]</td></tr>
            // ...workexp 2
            // f walks the siblings after `n`, appending their cell texts to the
            // current experience until the next date header or the end
            var f = function(n) {
                var text = n.text().trim().replace(/(\d+)\s*[\.\/]/g, "$1/");
                while(1) {
                    n = n.next();
                    // end of section
                    if(n.length <= 0) {
                        workexps.push(text.replace(sectiontext, '').replace(/(\d+)年(\d+)月/g, '$1/$2').trim());
                        break;
                    }
                    // need to see if next element starts a date range, if so, it's next workexp
                    var nt = n.text().replace(/(\d+)年(\d+)月/g, '$1/$2').trim();
                    // new workexp ?
                    if(m=nt.match(/(^\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2})\s*(.*?)/g)) {
                        // it should match date start pattern
                        if(stringsim.distance(pattern, resumeutil.padDate(m[0].replace(/\d/g, 1).replace(/\s+/g, ''))) == 0) {
                            workexps.push(text.replace(sectiontext, '').replace(/(\d+)年(\d+)月/g, '$1/$2').trim());
                            // good for next workexp
                            text = nt.replace(/(\d+)\s*[\.\/]/g, "$1/");
                            continue;
                        }
                    }
                    // replace br with delimiter
                    n.find('br').each(function() {
                        $(this).replaceWith('<span> | </span>');
                    });
                    n.children().each(function() {
                        var v = $(this).text().trim();
                        if(v.length > 0)
                            text = text + " | " + v;
                    })
                }
            };
            $('*', c).each(function() {
                if(m = $(this).html().replace(/(\d+)年(\d+)月/g, '$1/$2').trim().match(/(^\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2})\s*(.*?)/g)) {
                    if(!pattern) {
                        // remember first date range as pattern (digits masked as 1).
                        // Each following workexp should have the same pattern
                        pattern = resumeutil.padDate(m[0].replace(/\d/g, 1).replace(/\s+/g, ''));
                    }
                    else {
                        // the max text distance is 0: 1111/11--1111/11 => 1111/11--1111/11
                        if(stringsim.distance(pattern, resumeutil.padDate(m[0].replace(/\d/g, 1).replace(/\s+/g, ''))) > 0) {
                            // most likely this is not a new workexp
                            return true;
                        }
                    }
                    // climb to the outermost node that still holds only this text
                    var t = $(this).text().trim();
                    n = $(this);
                    while(1) {
                        if(n.parent().text().trim() != t) {
                            f(n);
                            break;
                        }
                        n = n.parent();
                    }
                    return false;
                }
            })
        })
    }
    else {
        // the next each is used to handle non-horizontal layout
        founds.each(function(i, el) {
            var text = $(this).text().trim();
            var p = $(this);
            while(1) {
                p = p.parent();
                var ntext = p.text().trim();
                if(ntext != text) {
                    // replace br with delimiter
                    p.find('br').each(function() {
                        $(this).replaceWith('<span> | </span>');
                    })
                    // replace any tag with delimiter to correctly build text
                    if(p.children().length > 0) {
                        $('*', p).each(function() {
                            // leaf elements only (was `children() <= 0`, which relied
                            // on coercing the cheerio object to a number)
                            if($(this).children().length <= 0) {
                                if($(this).text().length > 0)
                                    $(this).html($(this).html() + " |");
                                else {
                                    $(this).replaceWith(' | ');
                                }
                            }
                        })
                    }
                    workexps.push(p.text().replace(sectiontext, '').trim());
                    break;
                }
            }
        });
    }
    parse_data.workexp = [];
    us.unique(workexps).forEach(function(v) {
        var w = self.parseWorkexp_v3(v);
        if(w)
            parse_data.workexp.push(w);
    });
    this.updateWorkexp(parse_data);
};
/**
 * Parse one '|'-delimited work-experience line into a record.
 *
 * The first item must start with a date range ("2013/02 -- 2013/09:" or
 * "2014.01 - 至今 "); the remaining items are classified as industry, position,
 * company size/type, salary, department, title and free job text.
 *
 * @param {string} workexp  The experience line.
 * @returns {Object|null} Record with job_year_start, job_year_end, job_months
 *          and whichever of job_company, job_compsize, job_comptype, job_spec,
 *          job_depart, job_title, job_salary, job_text could be determined;
 *          null when the line is empty or does not start with a date range.
 */
Parser51Job.prototype.parseWorkexp_v3 = function(workexp) {
    var m, tmp = {};
    // get out known things if they are present
    // some resumes have these
    if(m = workexp.match(/企业性质:(.*?)\s+/)) {
        tmp.job_comptype = m[1].trim().replace(/^\|/, '').trim();
        workexp = workexp.replace(m[0], '');
    }
    if(m = workexp.match(/工作描述:(.*?)$/)) {
        tmp.job_text = m[1].trim().replace(/^\|/, '').trim();
        workexp = workexp.replace(m[0], '');
    }
    workexp = us.compact(workexp.trim().split('|'));
    // nothing left to parse (empty line): treat as invalid instead of crashing
    if(workexp.length <= 0)
        return null;
    var exp = workexp.shift().trim();
    m = exp.match(/(\d{4}\s*[/\.]\d{1,2}\s*[-]{1,2}\s*(.*?))[:\s]/);
    if(!m) {
        // invalid workexp return null
        return null;
    }
    // normalise both ends to yyyy-mm (a non-date end such as 至今 is kept as is)
    var job_years = m[1].trim().replace('--', '-').split('-');
    tmp.job_year_start = job_years.shift().trim().replace(/\//g,'-').replace(/\./g, '-').replace(/\b(\w)\b/g, '0$1');
    tmp.job_year_end = job_years.shift().trim().replace(/\//g,'-').replace(/\./g, '-').replace(/\b(\w)\b/g, '0$1');
    // duration in 30-day months; an end that is not a date means "until now"
    var start_date = new Date(tmp.job_year_start);
    var end_date = /^\d/.test(tmp.job_year_end) ? new Date(tmp.job_year_end) : new Date();
    var diff_months = (new Date(end_date - start_date))/1000/60/60/24/30;
    tmp.job_months = diff_months > 0 ? Math.round(diff_months) : 0;
    // 南京智风多媒体有限公司(50-150人) [ 2个月]
    var head = exp.replace(m[0], '').replace(/\[(.*)?\]$/g, '');
    // find out the company size
    // NOTE(review): [少于|\d+\-\d+] is a character class, not an alternation;
    // it accepts any bracket whose first character is 少, 于, |, +, - or a digit
    m = head.match(/[\((]([少于|\d+\-\d+](.*)?)[)\)]/);
    if(m) {
        head = head.replace(m[0], '');
        tmp.job_compsize = m[1].trim();
    }
    var headwords = us.compact(head.split(' '));
    if(headwords.length > 0)
        tmp.job_company = headwords.shift().trim();
    // most below are not valid for 51job format
    if(headwords.length > 0)
        tmp.job_title = headwords.shift().trim();
    if(headwords.length > 0)
        tmp.job_spec = headwords.shift().trim();

    if(workexp.length <= 0)
        return tmp;
    // if company is not obtained from above match, the first one of workexp must be job company
    if(!tmp.job_company)
        tmp.job_company = workexp.shift().trim();
    // department may not be given, thus we need to guess it could be
    // example:
    // 所属行业:计算机服务(系统、数据服务、维修) | 软件测试软件测试实习生 | 主要是测试教育软件
    //
    var comptypes = ['外资(欧美)', '外资(非欧美)', '合资(欧美)', '合资(非欧美)', '国企', '民营公司', '外企代表处', '政府机关', '事业单位', '非盈利机构', '其它性质'];
    var values = [];
    while(workexp.length > 0) {
        var item = workexp.shift().trim();
        if(item.length <= 0) continue;
        if(m = item.match(/^所属行业[:](.*)?/)) {
            // value is either inline or the next item
            if(m[1] && m[1].trim().length > 0)
                tmp.job_spec = m[1].trim();
            else
                tmp.job_spec = workexp.length > 0 ? workexp.shift().trim() : '';
            continue;
        }
        if(m = item.match(/^工作职位[:](.*)?/)) {
            if(m[1] && m[1].trim().length > 0)
                tmp.job_title = m[1].trim();
            else
                tmp.job_title = workexp.length > 0 ? workexp.shift().trim() : '';
            continue;
        }
        if(item.match(/^规模/g)) {
            tmp.job_compsize = item.replace(/规模:/g,'');
            continue;
        }
        // a list of company types
        if(comptypes.indexOf(item) != -1) {
            tmp.job_comptype = item;
            continue;
        }
        if(m = item.match(/^(\d+)(.*?)\/月/)) {
            tmp.job_salary = parseInt(m[1], 10);
            // we break here because this is last basic information
            // only exception: when job_text is already matched, this is not last one
            if(!tmp.job_text)
                break;
            else
                continue;
        }
        // everything else will be in valid values for further check
        values.push(item);
        // in common case, the job text has very long strings, we stop here if next value is a bit long
        // the salary string max length could be 15
        if(values.length > 3 && workexp.length > 0 && us.first(workexp).trim().length > 15) {
            break;
        }
    }
    // remove all empty items
    values = us.compact(values);
    // values should not be more than 2 items (job depart, title)
    // if there are more than 3, we need to move it back to workexp
    // workexp should be at least 1
    if(values.length > 2 || workexp.length <= 0) {
        if(workexp.length <= 0 && !tmp.job_text)
            workexp = values.splice(values.length-1, values.length);
        else
            workexp = us.union(values.splice(2, values.length), workexp);
    }
    // values could be as in example:
    // 软件测试 | 软件测试实习生 | 主要是测试教育软件
    if(values.length > 0 && !tmp.job_spec)
        tmp.job_spec = values.shift();
    if(values.length > 0)
        tmp.job_depart = values.shift();
    if(values.length > 0)
        tmp.job_title = values.shift();
    // everything else is part of job text; leftover values and the remaining
    // items are joined with the same separator so they do not run together
    tmp.job_text = (tmp.job_text ? tmp.job_text + '\n' : '') + values.concat(workexp).join('\n').trim();
    return tmp;
};
/**
 * Derive the latest_* summary fields from `parse_data.workexp`.
 *
 * Entries without a company are ignored. Each entry whose start date is newer
 * than every entry seen so far overwrites latest_job_spec, latest_job_title and
 * latest_exp (its duration rendered as "N年M个月"); latest_company_name is only
 * filled while it is still empty.
 */
Parser51Job.prototype.updateWorkexp = function(parse_data) {
    var newestStart;
    for (var i = 0; i < parse_data.workexp.length; i++) {
        var exp = parse_data.workexp[i];
        if (!exp.job_company)
            continue;
        // only the first usable entry or a strictly newer one is considered
        if (newestStart && !(new Date(exp.job_year_start) > new Date(newestStart)))
            continue;

        if (!parse_data.latest_company_name)
            parse_data.latest_company_name = exp.job_company.replace(/\((\d+\-\d+(.*)?)\)/g, '').trim();
        parse_data.latest_job_spec = exp.job_spec;
        parse_data.latest_job_title = exp.job_title;

        // an end that does not start with a digit, or that precedes the start,
        // counts as "until now"
        var begin = new Date(exp.job_year_start);
        var finish;
        if (isNaN(exp.job_year_end.substr(0, 1))) {
            finish = new Date();
        } else {
            finish = new Date(exp.job_year_end);
            if (finish < begin)
                finish = new Date();
        }

        // duration in 30-day months
        var months = (new Date(finish - begin))/1000/60/60/24/30;
        months = months > 0 ? Math.round(months) : 0;

        var label;
        if (months <= 0)
            label = "";
        else if (months < 12)
            label = months+"个月";
        else if (months % 12 == 0)
            label = (months/12)+"年";
        else
            label = Math.floor(months/12)+"年"+(months % 12)+"个月";
        parse_data.latest_exp = label;

        newestStart = exp.job_year_start;
    }
};
// Locates the education-history section of the resume document, collects the
// individual education lines, parses each one with parseEducation_v3() and
// stores the results in parse_data.education. Finally calls updateEducation()
// to refresh the highest_* summary fields.
//
// $          - selector function over the resume HTML (presumably created by
//              cheerio.load(); confirm against the caller)
// parse_data - result object; receives the `education` array
//
// Returns nothing. When no section heading can be found the function returns
// early and parse_data is left untouched (parse_data.education is not reset).
Parser51Job.prototype.getEducation_v3 = function($, parse_data) {
// Sample education lines (date range, then school | major | degree):
// 2010/09 -- 2013/06:南京化工职业技术学院 | 信息科学技术 | 大专
// 2010/09 -- 至今:山西大学 | 信息与计算科学 | 硕士
// 2006/09 -- 2010/07:长治学院 | 数学与应用数学 | 本科
var self = this;
// found2 is a fallback: the last child node (of any visited element) whose own
// text is exactly the section heading.
var found2 = [];
// Step 1: find the last tag element whose trimmed text is exactly the heading
// 教育经历 ("education history") or 教育背景 ("education background").
var found = $('*').filter(function() {
// NOTE(review): reading this[0] implies `this` is a wrapped collection in the
// cheerio version in use, not a raw node - confirm before upgrading cheerio.
var elem = this[0];
if(!elem || elem.type != 'tag') return false;
if($(this).text().trim().match(/^(教育经历|教育背景)$/g))
return true;
// otherwise search the element's direct contents for the heading
var p = $(this).contents().filter(function() {
if($(this).text().trim().match(/^(教育经历|教育背景)$/g))
return true;
}).last();
// remember the matching child node as the fallback; the callback then returns
// undefined, so the element itself is not selected
if(p.length > 0)
found2 = p;
}).last();
if(found.length <= 0) {
if(found2.length <= 0)
return;
found = found2;
}
if(found.length <= 0)
return;
// Step 2: find the section container by climbing to the first ancestor whose
// text differs from the heading text alone.
var sectiontext = found.text().trim();
var p = found;
while(1) {
p = p.parent();
var text = p.text().trim();
if(text != sectiontext)
break;
}
// horizonlayout is set when the education entries are siblings that follow
// the heading's container instead of being nested below it.
var horizonlayout = false;
var founds = [];
// if the container text does not start with 教育经历, we climbed too far;
// restart from the heading and walk forward through following siblings.
// NOTE(review): only 教育经历 is tested here although the heading search above
// also accepts 教育背景.
if(!p.text().trim().match(/^教育经历/)) {
var p = found;
while(1) {
// no next sibling: climb one level (stop at the root) and try again
if(p.next().length <= 0) {
if(p.parent().length <=0)
break;
p = p.parent();
continue;
}
var n = p.next();
// a next sibling with visible text is taken as the section body
if(n && n.text().trim().length > 0) {
p = n;
break;
}
// some layout has such format
// <html>
// <body>
// <table>...教育经历..</table>
// <table>...edu 1</table>
// ...
// </body>
// </html>
// OR
// <table>
// <tr><td>2011 /9--2014 /4</td><td>南京理工大学</td><td>计算机技术</td><td>硕士</td><td>some text</td></tr>
// <tr>edu 2 ...</td>
//
// so we check whether the siblings after n contain a date range
// (the pattern below is not anchored to the start of the text)
// insection: null = nothing seen yet, true = inside the run of matching
// siblings, false = run has ended (everything after is rejected)
var insection = null;
founds = n.nextAll().filter(function() {
var v = $(this).text().trim();
// FIXME: somehow the pure text field won't be selected, so we have to select it specifically
// (this inner `n` shadows the outer one and is only used for the emptiness test)
var n = (this[0].next && this[0].next.type == 'text') ? this[0].next.data.trim() : '';
// we only add the items in the section
if((v.length <= 0 && n.length <=0) || insection === false)
return false;
if(v.match(/\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)/g)) {
insection = true;
return true;
}
// first sibling without a date range ends the section
insection = false;
return false;
});
if(founds.length >0) {
p = n;
horizonlayout = true;
break;
}
p = p.parent();
}
}
// when we reach here, p could be the html node when the parent of 教育经历 is html
//
// <h3 class="fc6699cc">教育经历</h3> <div class="resume-preview-dl"> 2011.09 - 至今 河海大学 电子信息科学与技术 硕士<br> 2007.09 - 2011.06 黄山学院 电子信息科学与技术 本科<br> </div>
// <tbody><tr><td>2009/09 -- 至今:南京理工大学紫金学院 | 计算机科学与技术 | 本科</td></tr></tbody>
// Step 3 (non-sibling layout): search the descendants of p for elements whose
// text starts with a date range.
if(!horizonlayout) {
// pp records the parent of the first matching education element
var pp = false;
// inpos turns false once the scan has left the education section; every
// later element is then rejected
var inpos = true;
var founds = $('*', p).filter(function() {
var elem = this[0];
if(!elem || elem.type != 'tag') return false;
// text might come from a parent element which contains multiple experience lines,
// but we only need to keep the single element having the date range;
// elements without child elements are skipped here
if($(this).children().length <=0) return false;
var text = $(this).text().trim();
// once the common parent pp is known, an element that does not share that
// parent, or whose text contains any non-digit character, ends the section
// NOTE(review): a date range itself contains non-digit characters, so after
// the first match any later non-leaf element with text stops the scan; this
// appears to keep only the first match - confirm this is intended.
if(pp && (!$(this).parent().is(pp) || (text.length > 0 && /\D+/.test(text)))) {
inpos = false;
}
if(!inpos)
return false;
// date range followed by a fullwidth colon, e.g. "2010/09 -- 2013/06:..."
if(text.match(/^\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)[:]/g)) {
// horizonlayout may have been switched on by an earlier iteration below;
// from then on only direct children of p are accepted
if(horizonlayout && !$(this).parent().is(p))
return false;
if(!horizonlayout) {
if($(this).parent().is(p)) horizonlayout = true;
}
pp = $(this).parent();
return true;
}
// same as above but for a date range followed directly by the school text
if(text.match(/^\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)\s*\S+/g)) {
if(horizonlayout && !$(this).parent().is(p))
return false;
if(!horizonlayout) {
if($(this).parent().is(p)) horizonlayout = true;
}
pp = $(this).parent();
return true;
}
});
}
// Step 4: turn every matched element into one or more education lines
var educations = [], pattern = null, m = null;
founds.each(function(i, el) {
// replace <br> with a delimiter so that lines can be split after text()
$(this).find('br').each(function() {
$(this).replaceWith('@@@@');
})
// replace any tag with a delimiter to correctly build the text
var p = this;
if($(this).children().length > 0) {
$('*', p).each(function() {
var elem = this[0];
// NOTE(review): this compares the children() collection itself (not its
// .length) with 0, so the result depends on how the collection coerces to
// a primitive; `.children().length <= 0` was probably intended - verify
// against the cheerio version in use before changing.
if($(this).children() <= 0) {
if($(this).text().length > 0)
$(this).html($(this).html() + " |");
else {
$(this).replaceWith(' | ');
}
}
})
}
// normalise "2011.09" / "2011 /9" to "2011/09" / "2011/9", then split on the <br> delimiter
var values = us.compact($(this).text().replace(/(\d+)\s*[\.\/]/g, "$1/").trim().split(/@@@@/g));
// values might hold multiple educations on one line for horizontal layout
var nv = [];
values.forEach(function(v) {
// split on the start of each date range; the capturing group keeps the
// matched start ("2011/9--") as its own array item
var n = us.compact(v.split(/(\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2})/));
// if more than one education presents, n is larger than 2
// '2011/9--', '2014/4 | 南京理工大学 | 计算机技术 | 硕士
// '2007/9--', '2011/6 | 江苏理工学院 | 计算机科学与技术 | 本科
// re-join the pieces pairwise (range start + remainder)
// NOTE(review): with an odd number of pieces the second shift() yields
// undefined, which string concatenation turns into the text "undefined".
while(n.length > 0) {
nv = nv.concat(n.shift() + n.shift());
}
})
values = nv;
// keep only values that start with a date range; anything else is ignored
values = values.filter(function(v) {
if(!(m=v.trim().match(/^(\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2})/)))
return false;
if(!pattern) {
// remember the shape of the first date-range start (digits replaced by 1,
// whitespace removed, then passed through resumeutil.padDate) as the pattern.
// Each following entry should have the same shape
pattern = resumeutil.padDate(m[0].replace(/\d/g, 1).replace(/\s+/g, ''));
}
else {
// the max text distance is 0: 1111/11--1111/11 => 1111/11--1111/11
if(stringsim.distance(pattern, resumeutil.padDate(m[0].replace(/\d/g, 1).replace(/\s+/g, ''))) > 0) {
// most likely this is not a new entry
return false;
}
}
return true;
});
if(values.length <= 0)
return;
values = values.map(function(v) { return v.trim(); });
// accumulate without duplicates across all matched elements
educations = us.union(educations, values);
});
// Step 5: parse every collected line; lines that cannot be parsed are dropped
parse_data.education = [];
us.unique(us.compact(educations)).forEach(function(v) {
var e = self.parseEducation_v3(v.trim());
if(e)
// appends e at the end of the array
parse_data.education[parse_data.education.length++] = e;
});
this.updateEducation(parse_data);
}
Parser51Job.prototype.parseEducation_v3 = function(education) {
// make sure 2010.09 - 2014.07 => 2010.09-2014.07 or 2010/09 -- 2013/06: => 2010/09-2013/06:
education = education.replace(/\s*\-+\s*/g, '-').split('|');
if(education.length <= 1) {
education = education.shift().split(' ');
}
education = us.compact(education);
if(!education || education.length <= 1)
return null;
var m, tmp = {}, edu = education.shift().trim();
// example:
// 2010/09-2013/06:南京化工职业技术学院
// 2010.09-2014.07
// 2008/9 - 2012/7 南京理工大学紫金学院 计算机科学与技术 本科
if(m = edu.match(/\d{4}[\/\.]\d{1,2}\-(.*)?/)) {
var values = m[0].split(/[:\s]/);
var school_years = values.shift().trim().split('-');
tmp.school_year_start = school_years.shift().replace(/\//g,'-').replace(/\./g, '-').replace(/\b(\w)\b/g, '0$1');
tmp.school_year_end = school_years.shift().replace(/\//g,'-').replace(/\./g, '-').replace(/\b(\w)\b/g, '0$1');
if(values.length > 0)
tmp.school_name = values.shift().trim();
if(values.length > 0)
tmp.major_name = values.shift().trim();
if(values.length > 0)
tmp.degree_level = values.shift().trim();
// if first doesn't have school name, school name should be second
if(!tmp.school_name || tmp.school_name.length <= 0)
tmp.school_name = education.shift().trim();
}
if(education.length > 0 && !tmp.major_name)
tmp.major_name = education.shift().trim();
if(education.length > 0 && !tmp.degree_level)
tmp.degree_level = education.shift().trim();
// degree level should not be a large string otherwise we are on wrong data
if(!tmp.degree_level || tmp.degree_level.length > 5)
return null;
// ignore major name if degree level is high school
if(tmp.degree_level == '高中')
tmp.major_name = '';
// if we don't have valid degree, ignore it
// if(['高中', '中专', '中技', '大专', '本科', '硕士', '研究生', '博士', 'MBA', 'EMBA']
return tmp;
}
// Copies degree, major and school of the education record with the most
// recent school_year_start into the highest_* fields of parse_data.
// Empty entries in parse_data.education are skipped; nothing is written when
// the list has no usable record.
Parser51Job.prototype.updateEducation = function(parse_data) {
  // start date of the record currently reflected in the highest_* fields
  var newestStart = 0;

  // true when nothing has been picked yet or `start` is later than the pick
  function startsLater(start) {
    return !newestStart || new Date(start) > new Date(newestStart);
  }

  parse_data.education.forEach(function(record) {
    if (!record || !startsLater(record.school_year_start))
      return;
    parse_data.highest_degree_level = record.degree_level;
    parse_data.highest_major_name = record.major_name;
    parse_data.highest_school_name = record.school_name;
    newestStart = record.school_year_start;
  });
}