casperjs 解析51job履历

casperjs 解析51job简历

使用 casperjs 抓取对应 URL 的简历页面，解析获取到的 HTML，并从中提取匹配的数据。

'use strict';

var cheerio = require('cheerio');

var us = require('underscore');

var stringsim = require("stringsim");

var resumeutil = require("resumeutil");

// Expose to the world

// Public entry point of the module.
//
// subject  - mail subject line the resume arrived with (may be empty)
// html     - resume HTML to parse
// callback - optional; forwarded to the parser, which calls it with the list
//            of resume links when the HTML is only a forwarding page
//
// Returns the parsed data object, or null when there is nothing to parse.
var parse51Job = module.exports = function(subject, html, callback) {
    // Do not go through `new Parser51Job(...)` here: when a constructor
    // returns a non-object value such as null, `new` discards it and hands
    // back the freshly created instance, so a failed parse would look like a
    // (truthy) result to the caller. Build the instance explicitly and
    // return whatever parse() returns.
    var parser = Object.create(Parser51Job.prototype);

    return parser.parse(subject, html, callback);
};

// Constructor that immediately runs the parse and returns its outcome.
// When parse() yields an object, `new Parser51Job(...)` evaluates to that
// object; otherwise the regular instance is produced by `new`.
function Parser51Job(subject, html, callback) {
    var outcome = this.parse(subject, html, callback);

    return outcome;
}

// Runs the full pipeline for one resume: HTML parsing (parse_v3), enrichment
// from the mail subject (parseSubject) and the final clean-up (sanityCheck).
// Returns null straight away when no HTML is supplied.
Parser51Job.prototype.parse = function(subject, html, callback) {
    if (html) {
        var parsed = this.parse_v3(html, callback);

        this.parseSubject(parsed, subject);

        return this.sanityCheck(parsed);
    }

    return null;
};

// Final clean-up pass over the parsed data before it is returned.
// Each field is only touched when it holds a truthy value; a falsy `data`
// is handed back unchanged.
Parser51Job.prototype.sanityCheck = function(data) {
    if (data) {
        if (data.mobile) {
            data.mobile = resumeutil.convertUnicodeNumber(data.mobile);
        }

        if (data.email) {
            data.email = resumeutil.getEmail(data.email);
        }

        // e.g. turn "三年以上工作经验" into "3年以上工作经验"
        if (data.exp) {
            data.exp = resumeutil.convertChineseNumber(data.exp);
        }
    }

    return data;
};

// Fills in job title / city / candidate name from the mail subject line.
//
// data    - parsed resume object to enrich in place (no-op when falsy)
// subject - mail subject; two layouts are recognised:
//     (51job.com)申请贵公司安卓开发工程师（南京）－王茜（wang xi）
//     (51job)转发简历高级php开发工程师(南京-建邺区)
//
// data.job_title is always reset to null first and stays null when the
// subject matches neither layout.
Parser51Job.prototype.parseSubject = function(data, subject) {
    if(!data)
        return;

    data.job_title = null;

    if(!subject)
        return;

    var location;

    // The leading "(51job...)" marker has to be matched with escaped
    // parentheses; a pattern beginning with `$` can never match anything
    // followed by more text.
    if(/\(.*\)转发简历/.test(subject)) {
        // forwarded resume: "<title>(<city>-<district>)"
        subject = subject.replace(/\t|\n/g,'').replace(/^(.*)转发简历/, '').replace(/\)/, '');

        var list = subject.split(/\(/);

        if(list.length > 0) {
            data.job_title = list[0] ? list[0].trim() : null;

            location = list[1] ? list[1].split('-') : [];

            data.jobcity = location[0] ? location[0].trim() : '';

            data.jobcitydist = location[1] ? location[1].trim() : '';
        }
    }
    else if(/\(.*\)申请贵公司/.test(subject)) {
        // direct application: "<title>（<city>）－<name>"
        subject = subject.replace(/\t|\n/g,'').replace(/^(.*)申请贵公司/, '');

        var pieces = subject.split(/－/);

        // the candidate name is whatever follows the last full-width dash
        if(pieces.length > 1)
            data.name = pieces.pop().trim();

        var title = pieces.join('－');

        var m = title.match(/(.*)（(.*?)）$/);

        if(m) {
            data.job_title = m[1].trim();

            location = m[2].split('-');

            data.jobcity = location[0].trim();

            data.jobcitydist = location[1] ? location[1].trim() : '';
        }
        else
            data.job_title = title.trim();
    }
};

// v3 is third version of parsing
//
// Extracts resume fields from a 51job HTML page into a plain object.
// The markup is normalised, loaded into cheerio and then searched by keyword:
// the code looks for well-known labels and reads the text placed next to them.
//
// html     - resume HTML string
// callback - optional; when given and the page contains links to
//            /Candidate/ResumeViewSingle.aspx, it is called with the array of
//            hrefs and this function returns null
//
// Returns the collected parse_data object (after getWorkexp_v3 and
// getEducation_v3 have been run on it), or null in the forwarding case above.
//
// NOTE(review): the braces in this function do not balance as written (e.g. the
// `if(callback) {` below is never closed) and there is no indentation; it
// looks like consecutive closing braces were collapsed. Confirm against the
// original file before relying on the nesting.
Parser51Job.prototype.parse_v3 = function(html, callback) {

var self = this;

var m, parse_data = {};

// normalise the markup: drop tabs/newlines, collapse whitespace runs, turn
// full-width colons into ':' and remove the space in `= "` / `= '`
// NOTE(review): `.replace(/ /g, ' ')` replaces a space with a space as written;
// possibly a non-breaking space was lost here - confirm
var html = html.replace(/\t|\n/g,'').replace(/ /g, ' ').replace(/\s+/g, ' ').replace(/：/g, ':').replace(/= ("|')/g, "=$1");

var $ = cheerio.load(html, {decodeEntities: true});

parse_data.update_time = $('#__ppp_update_time').text().trim();

// we use relative-to-key-word parsing method to find relative information

parse_data.source = '51job';

// before we processing the html, let's see whether it contains a link to open up the resumes. Only do it when callback is given for overwrite

if(callback) {

var hrefs = [];

var links = $('a').filter(function(v) {

if($(this).attr('href') && $(this).attr('href').match(/\/Candidate\/ResumeViewSingle.aspx/)) {

hrefs.push($(this).attr('href'));

return true;

}

return false;

});

if(hrefs.length > 0) {

// this is forwarding resume html, we need to open up the link, and save the content to overwrite resumefile

callback(hrefs);

return null;

}

// "应聘职位" (position applied for) -> parse_data.job_title2
// start from the last element whose text contains the label

var found = $('*:contains("应聘职位:")').last();

if(found.length > 0) {

var located = false;

// the element holds only the label: walk up the parents until the text
// also carries a value
if(found.text().trim() == '应聘职位:') {

while(found.length > 0) {

var s = found.text().trim();

if(s != '应聘职位:') {

// only need to split ': '. the space should be followed by :

var values = us.compact(s.replace(/(.*)应聘职位:/, '应聘职位:').split(/:(?=\s*)/));

if(values.length <= 1) {

found = found.parent();

continue;

}

if(values.length >2) {

//we have some format such as: m: value n: value (resume17.html)

var n = values[1].trim().split(/\s+/);

// NOTE(review): n is an array here, so `n <=1` compares an array with a
// number; probably `n.length` was meant - confirm
if(n <=1)

parse_data.job_title2 = n.shift();

else {

n.pop();

parse_data.job_title2 = n.join(' ');

}

else

parse_data.job_title2 = values[1].trim();

break;

}

else {

found = found.parent();

}

else {

// label and value live in the same element: the value is the first
// non-empty content node after the label node
found.contents().each(function() {

var s = $(this).text().trim();

if(s == '应聘职位:') {

located = true;

}

else if(s.length > 0 && located) {

parse_data.job_title2 = s;

return false;

}

});

}

// "应聘公司:" (company applied to) -> parse_data.company_name
// same lookup strategy as for the applied position above

found = $('*:contains("应聘公司:")').last();

if(found.length > 0) {

var located = false;

if(found.text().trim() == '应聘公司:') {

while(found.length > 0) {

var s = found.text().trim();

if(s != '应聘公司:') {

var values = us.compact(s.replace(/(.*)应聘公司:/, '应聘公司:').split(/:(?=\s*)/));

if(values.length <= 1) {

found = found.parent();

continue;

}

if(values.length >2) {

//we have some format such as: m: value n: value (resume17.html)

var n = values[1].trim().split(/\s+/);

if(n <=1)

parse_data.company_name = n.shift();

else {

n.pop();

parse_data.company_name = n.join(' ');

}

else

parse_data.company_name = values[1].trim();

break;

}

else {

found = found.parent();

}

else {

found.contents().each(function() {

var s = $(this).text().trim();

if(s == '应聘公司:') {

located = true;

}

else if(s.length > 0 && located) {

parse_data.company_name = s;

return false;

}

});

}

// "投递时间:" (time of application) -> parse_data.update_time
// only looked up when #__ppp_update_time gave no value

if(!parse_data.update_time) {

found = $('*:contains("投递时间:")').last();

if(found.length > 0) {

var located = false;

if(found.text().trim() == '投递时间:') {

while(found.length > 0) {

var s = found.text().trim();

if(s != '投递时间:') {

var values = us.compact(s.replace(/(.*)投递时间:/, '投递时间:').split(/:(?=\s*)/));

if(values.length <= 1) {

found = found.parent();

continue;

}

if(values.length >2) {

//we have some format such as: m: value n: value (resume17.html)

var n = values[1].trim().split(/\s+/);

if(n <=1)

parse_data.update_time = n.shift();

else {

n.pop();

parse_data.update_time = n.join(' ');

}

else

parse_data.update_time = values[1].trim();

break;

}

else {

found = found.parent();

}

else {

found.contents().each(function() {

var s = $(this).text().trim();

if(s == '投递时间:') {

located = true;

}

else if(s.length > 0 && located) {

parse_data.update_time = s;

return false;

}

});

}

// job location ("工作地点"): taken from a trailing parenthesised suffix of
// job_title2, for example:

// 高级项目经理（规划设计分院）（深圳-南山区）

// 初级软件测试工程师（南京-建邺区）

if(parse_data.job_title2 && parse_data.job_title2.length > 0) {

// the captured text must not itself contain a parenthesis
// NOTE(review): the `$` inside `[$（]` and in the look-ahead reads like a
// garbled `\(` / `\)`; as written an ASCII "(" does not open the suffix -
// confirm against the original pattern

if(m = parse_data.job_title2.match(/[$（](((?!(\(|$|）|（)).)*)[）\)]$/)) {

var location = m[1].split('-');

parse_data.jobcity = location.shift().trim();

parse_data.jobcitydist = location.length > 0 ? location.shift().trim() : '';

parse_data.job_title2 = parse_data.job_title2.replace(m[0], '').trim();

}

// pre-fill some information

parse_data.highest_degree_level = '';

parse_data.highest_major_name = '';

parse_data.highest_school_name = '';

parse_data.latest_company_name = '';

parse_data.latest_job_spec = '';

parse_data.latest_job_title = '';

parse_data.latest_exp = '';

// basic information

var located = false;

found = $('*').filter(function() {

var elem = this[0];

if(!elem || elem.type != 'tag') return false;

// check first posibility

if($(this).text().trim().match(/\|\s+(男|女|Male|Female)\s+/ig)) {

located = true;

return true;

}

// use first match to avoid reading user's input (resume18.html)

if(located)

return;

return $(this).text().trim().match(/^(男|女|Male|Female)\s+/ig);

}).last();

// found could be:

// 应届毕业生 | 男 | 21岁（1992年6月23日） | 未婚 | 175cm| 团员

// 一年以上工作经验 | 女 | 23岁（1990年6月22日） | 未婚 | 170cm| 群众

// 在读学生 | 女 | 15岁（1999年2月6日）

// 二年以上工作经验 | 男 | 24岁（1989年10月2日） | 未婚 | 175cm| 中共党员 | 身份证： 320626196607065213

// 男 | 30岁(1983年6月17日)

// > 10 years| Male | 34Years old(1979/3/24)

if(found.length > 0) {

var foundtext = found.text().trim();

var values = foundtext.split('|');

if(values.length <= 1)

values = values.shift().split(/\s+/);

else {

// add | before first item to split other tags when there is not work status

found.prepend('<span> | </span>');

foundtext = found.text().trim();

values = foundtext.split('|');

}

values = us.compact(values);

// the first remaining item is stored as the experience summary
if(values.length > 0)

parse_data.exp = values.shift().trim();

// get rest

while(values.length > 0) {

var v = values.shift().trim();

if(m = v.match(/(男|女|Female|Male)/ig)) {

var sex = m[0].toLowerCase();

// English values are mapped to the Chinese ones
parse_data.sex = sex == 'female'? '女' : (sex == 'male' ? '男': sex);

}

if(m = v.match(/^(\d+)(岁|Years old)/i)) {

parse_data.age = m[1];

}

// two formats:

// 1966 年7月生

// 1966年7月6日

if(m = v.match(/\d+\s*年\d+月(\d+日|生)/g)) {

// convert to dash-separated form and zero-pad single characters
parse_data.birth = m[0].replace(/\s/, '').replace(/年|月/g,'-').replace(/日|生/g,'').replace(/\b(\w)\b/g, '0$1');

// no day given: use the first day of the month
if(/\-$/.test(parse_data.birth))

parse_data.birth += '01';

continue;

}

if(m = v.match(/\d+\s*\/\d+\/\d+/g)) {

parse_data.birth = m[0].replace(/\s/, '').replace(/\//g,'-').replace(/\b(\w)\b/g, '0$1');

continue;

}

if(v.match(/婚$/g) || v.match(/单身/g) || v.match(/离异/g)) {

parse_data.marriage = v;

continue;

}

if(m = v.match(/^(\d+)cm/)) {

parse_data.height = m[1];

break;

}

// whatever follows is stored as the political affiliation (socialparty)
if(values.length > 0) {

parse_data.socialparty = values.shift().trim();

}

// get name. The name is placed before basic information

// 流程状态:xxx 标签: 在读学生 | 女 | 22岁（1991年8月9日）(ID:309207915)居住地:南京-雨花台区户口:扬州电话:15295753639（手机）E-mail:15295753639@163.com

var p = found;

while(1) {

p = p.parent();

if(p.length <= 0)

break;

var text = p.text().trim();

var pos = text.indexOf(foundtext);

if(pos > 0) {

text = text.replace(/^流程状态:/g, '').replace(/^已转发/g, '');

pos = text.indexOf(foundtext);

var fronttext = text.substring(0, pos).trim();

var lasttext = text.substring(pos, text.length).trim();

// the name is ahead of foundtext, but we need to figure out (resume17.html)

// 1. if it's english(a-z), keep to reading back till we meet non-english; space is allowed

// 2. if it's chinese, till read we have at least three chinese chars; space is allowed

var values = us.compact(fronttext.split(/\s+(?![a-z]+）)/));

var i = 0, onlyenglish=false;

// scan the tokens from the end of the list backwards
while(1) {

if(i>= values.length)

break;

// remove special char or only numbers

if(/(:|^\d+$)/.test(values[values.length-1-i])) {

values.splice(values.length-1-i, 1);

continue;

}

else if(/^[a-z][a-z0-9]*$/i.test(values[values.length-1-i])) {

onlyenglish = true;

}

else {

onlyenglish = false;

}

i++;

// the name should not be up to 3 levels

if((i >=3 && !onlyenglish) || (onlyenglish && i>=5))

break;

}

// if only english, we need to combind together

if(onlyenglish)

parse_data.name = values.join(' ').trim();

else if (values.length > 2)

parse_data.name = values.pop();

else

parse_data.name = values.shift();

// remove some special chars -- or ―

if(parse_data.name)

parse_data.name = parse_data.name.replace(/[\-―](.*)/, '');

// look for ID directly
// NOTE(review): a pattern starting with `$` cannot be followed by "ID:", so
// this never matches as written; it looks like a garbled `\(ID:(\d+)\)` -
// confirm

if(m = lasttext.match(/$ID:(\d+)$/)) {

parse_data.resumecode = m[1];

}

break;

}

// get next basic information

// layout:

// 居住地:北京-大兴区

// <td>居住地:</td>

// <td>xxx</td>

// <td>户　口：</td>

// <td>xxx</td>

// 电话:<b>15510491669</b>（手机）

// each lookup below is scoped to p; when the matched element holds only the
// label, the value is read from its next sibling

var found = $('*:contains("居住地:")', p).last();

if(found.length > 0) {

var location = found.text().trim();

if(location == "居住地:")

location = found.next().text();

location = location.replace(/居住地:/, '').split('-');

parse_data.city = location.shift().trim();

parse_data.citydist = location.length > 0 ? location.shift().trim() : '';

}

var found = $('*:contains("户口:")', p).last();

if(found.length > 0) {

var residence = found.text().trim();

if(residence == "户口:")

residence = found.next().text().trim();

parse_data.residence = residence.replace(/户口:/, '');

}

var found = $('*:contains("电话:")', p).last();

if(found.length > 0) {

var mobile = found.text().trim();

if(mobile == "电话:")

mobile = found.next().text().trim();

parse_data.mobile = mobile.replace(/电话:/, '').replace(/（手机）/, '').trim();

}

// english telephone number:

var found = $('*:contains("Telephone number:")', p).last();

if(found.length > 0) {

var mobile = found.text().trim();

if(mobile == "Telephone number:")

mobile = found.next().text().trim();

// NOTE(review): `/$MobilePhone$/` cannot match as written; it looks like a
// garbled `\(MobilePhone\)` - confirm
parse_data.mobile = mobile.replace(/Telephone number:/, '').replace(/$MobilePhone$/, '').trim();

}

var found = $('*:contains("E-mail:")', p).last();

if(found.length > 0) {

var email = found.text().trim();

if(email == "E-mail:")

email = found.next().text().trim();

parse_data.email = email.replace(/E-mail:/, '').trim();

}

// national ID

var found = $('*:contains("身份证:")', p).last();

if(found.length > 0) {

var nationid = found.text().trim();

if(nationid == "身份证:")

nationid = found.next().text().trim();

parse_data.nationid = nationid.replace(/身份证:/, '').trim();

}

else {

// alternative layout: a "基本信息" / "Basic Info." heading followed by a
// label/value table
var first = 1;

found = $('*').filter(function() {

var elem = this[0];

if(!elem || elem.type != 'tag') return false;

// we only need to obtain first basic information resume11.html has multiple basic information provided by user custom input

if(first && $(this).text().trim().match(/^(基本信息|Basic Info.)$/g)) {

first = 0;

return true;

}

return false;

}).last();

// possible layout:

// <td style="width:width:14%" class="weight100"> 姓名： </td>

// <td style="width:190px" class="weight190"> xxx </td>

// <td style="width:width:14%" class="weight100"> 性别： </td>

// <td style="width:190px" class="weight190"> 女 </td>

// <td rowspan="7" style="width:width:14%" class="weight100" valign="top">

// <td> 出生日期： </td> <td> 1990年7月30日 </td>

// <td> 居地： </td> <td> 南京 </td> </tr>

// <td> 工作年限： </td> <td> 一年以上 </td>

// <td> 电子邮件： </td> <td> layla1990@qq.com </td>

// <td> 学历： </td> <td> 本科 </td>

// <td> 专业： </td> <td> 计算机科学与技术 </td>

// <td> 职能： </td> <td> 活动策划 </td>

// <td> 行业： </td> <td> 互联网/电子商务 </td>

// <td> 手机号码： </td> <td colspan="3"> 13951087924 </td>

// <td> 公司电话： </td> <td colspan="3"> 086- 029- 88371041 </td>

// <td> 关键词： </td> <td colspan="3"> 建筑设计城市规划机场总图土地开发 </td>

if(found.length > 0) {

// [parse_data field, label text] pairs; Chinese and English labels map to
// the same field
var keys = [['name', '姓名:'],

['name', 'Name:'],

['sex', '性别:'],

['sex', 'Gender:'],

['birth', '出生日期:'],

['birth', 'Date of Birth:'],

['location', '居住地:'],

['location', 'Residency:'],

['exp', '工作年限:'],

['exp', 'Yrs.of Experience::'],

['email', '电子邮件:'],

['email', 'Email:'],

['mobile', '手机号码:'],

['mobile', 'Mobile Phone:'],

['companyphone', '公司电话:'],

['highest_degree_level', '学历:'],

['highest_degree_level', 'Degree:'],

['highest_major_name', '专业:'],

['highest_major_name', 'Major:'],

['latest_job_pos', '职能:'],

['latest_job_pos', 'Job Category:'],

['latest_job_spec', '行业:'],

['latest_job_spec', 'Industry:'],

['keyword', '关键词:']];

var p = found.next();

keys.forEach(function(o) {

var k = o.shift(), v=o.shift();

var found = $('*:contains("'+v+'")', p).last();

if(found.length > 0) {

// 'location' is split into city / district, 'birth' is normalised to a
// dash-separated date; everything else is copied as trimmed text
if(k == 'location') {

var location = found.next().text().split('-');

parse_data.city = location.shift().trim();

parse_data.citydist = location.length > 0 ? location.shift().trim() : '';

return;

}

else if(k == 'birth') {

parse_data.birth = found.next().text().trim().replace(/\//g, '-').replace(/年|月/g,'-').replace(/日/g,'').replace(/\b(\w)\b/g, '0$1');

return;

}

else

parse_data[k] = found.next().text().trim();

}

})

}

// national ID. This can be anywhere, so we have to search globally

var found = $('*:contains("身份证:")').last();

if(found.length > 0) {

var nationid = found.text().trim();

nationid = nationid.replace(/(.*)身份证:/, '').trim().match(/^\d+/);

// NOTE(review): match() yields null when no digits follow the label, in
// which case `.length` below would throw - confirm whether that can happen
if(nationid.length > 0)

parse_data.nationid = nationid.shift();

}

// latest_job_spec/highest degree level might be in 最近工作 and 最高学历 sections

if(!parse_data.latest_job_spec || !parse_data.highest_degree_level) {

// "最近工作" (most recent job) section

var found = $('*').filter(function() {

var elem = this[0];

if(!elem || elem.type != 'tag') return false;

return $(this).text().trim().match(/^最近工作\s*\[/g);

}).last();

// format:

// <table><tr><td><b>最近工作</b></span><span><b>[ 2个月]</b></span></td></tr>

// <tr><td width="59">公　司：</td><td width="230">南京智风多媒体有限公司</td></tr>

// <tr><td>行　业：</td><td>计算机服务(系统、数据服务、维修)</td></tr>

// <tr><td>职　位：</td><td>软件测试实习生</td></tr></tbody>

// </table>

if(found.length > 0) {

var p = found;

// widen the search scope one parent at a time until the "公司:" label is
// found inside it
while(1) {

var section = $('*:contains("公司:")', p).last();

if(section.length > 0) {

// found the root node

parse_data.latest_company_name = section.next().text().trim();

break;

}

p = p.parent();

if(p.length <= 0)

break;

}

if(parse_data.latest_company_name) {

section = $('*:contains("行业:")', p).last();

if(section.length > 0)

parse_data.latest_job_spec = section.next().text().trim();

section = $('*:contains("职位:")', p).last();

if(section.length > 0)

parse_data.latest_job_title = section.next().text().trim();

}

// "最高学历" (highest education) section

var found = $('*').filter(function() {

var elem = this[0];

if(!elem || elem.type != 'tag') return false;

return $(this).text().trim().match(/^最高学历$/g);

}).last();

// format:

// <td><tbody><tr><td colspan="2"><span class="font14 blue"><b>最高学历</b></span></td></tr>

// <tr><td width="59">学　历：</td><td width="230">大专</td></tr>

// <tr><td>专　业：</td><td>模具设计与制造</td></tr>

// <tr><td height="22">学　校：</td><td>昆山登云科技职业学院</td></tr>

// </tbody></table></td>

if(found.length > 0) {

var p = found;

// same widening search as for the most recent job, keyed on "学历:"
while(1) {

var section = $('*:contains("学历:")', p).last();

if(section.length > 0) {

// found the root node

parse_data.highest_degree_level = section.next().text().trim();

break;

}

p = p.parent();

if(p.length <= 0)

break;

}

if(parse_data.highest_degree_level) {

section = $('*:contains("专业:")', p).last();

if(section.length > 0)

parse_data.highest_major_name = section.next().text().trim();

section = $('*:contains("学校:")', p).last();

if(section.length > 0)

parse_data.highest_school_name = section.next().text().trim();

}

// work history and education are extracted by dedicated methods that write
// into parse_data
this.getWorkexp_v3($, parse_data);

this.getEducation_v3($, parse_data);

return parse_data;

}

// Collects the work-experience entries of a resume.
//
// $          - cheerio instance loaded with the resume HTML
// parse_data - result object; on success parse_data.workexp is set to the
//              array of records produced by parseWorkexp_v3, and
//              updateWorkexp is then run on parse_data
//
// Returns nothing. When no section heading (工作经验 / 职业经历 / 工作经历)
// is found the function returns early without touching parse_data.
//
// NOTE(review): the braces in this function do not balance as written and
// there is no indentation; it looks like consecutive closing braces were
// collapsed. Confirm the nesting against the original file.
Parser51Job.prototype.getWorkexp_v3 = function($, parse_data) {

// get work history ("工作经历"); sample lines:

// 2013/02 -- 2013/09：江苏联盛科技有限公司 | 技术部 | java软件工程师 | IT服务（系统/数据/维护）/多领域经营 | 民营 | 规模:20-99人 | 2001-4000元/月 | asldkfj

// 2014.01 - 至今 333 销售代表互联网/电子商务 | 企业性质：外商独资工作描述： 333

var self = this;

var found2 = [];

// locate the heading: an element whose whole text is the heading, or
// (fallback, kept in found2) a content node of an element with that text
var found = $('*').filter(function() {

var elem = this[0];

if(!elem || elem.type != 'tag') return false;

if($(this).text().trim().match(/^(工作经验|职业经历|工作经历)[:]?$/g))

return true;

// otherwise search contents

var p = $(this).contents().filter(function() {

if($(this).text().trim().match(/^(工作经验|职业经历|工作经历)[:]?$/g))

return true;

}).last();

// use sub found2

if(p.length > 0)

found2 = p;

}).last();

if(found.length <= 0) {

if(found2.length <= 0)

return;

found = found2;

}

// find the section by finding its parent

var sectiontext = found.text().trim();

var p = found;

while(1) {

p = p.parent();

var text = p.text().trim();

if(text != sectiontext)

break;

}

var horizonlayout = false, topend = false;

var founds = [];

// if p.text is not starting with 工作经历, it means we are going up too far

// the section should be next node

var p = found;

while(1) {

// no next sibling: climb to the parent and try again
if(p.next().length <= 0) {

if(p.parent().length <=0)

break;

p = p.parent();

continue;

}

var n = p.next();

if(n && n.text().trim().length > 0) {

p = n;

break;

}

// some layout has such format

// <html>

// <body>

// <table>...工作经历..</table>

// <table>...work exp 1</table>

// <table>...work exp 2</table>

// ...

// </body>

// </html>

// so we check if next item starting with date range

var insection = null;

founds = n.nextAll().filter(function() {

var v = $(this).text().trim();

// FIXME: somehow the pure text field won't be selected, so we have to select it specifically

var n = (this[0].next && this[0].next.type == 'text') ? this[0].next.data.trim() : '';

if((v.length <=0 && n.length <= 0) || insection === false)

return false;

// rewrite "YYYY年MM月" as "YYYY/MM" before testing for a leading date range
v = v.replace(/(\d+)年(\d+)月/g, '$1/$2');

if(v.match(/^\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)/g)) {

insection = true;

return true;

}

// first sibling without a leading date range ends the section
insection = false;

return false;

});

if(founds.length >0) {

p = n;

horizonlayout = true;

break;

}

// look for next parent

p = p.parent();

// if we the workexp is included in next parent. when workexp is deep under

// text mode, previous nextAll might not get it:

// <div align="left"><b>职业经历</b>

// <b> </b>

// <br>

// 2011.09-至今：西部机场集团规划发展部，机场规划与土地管理主管

// </div>

// <div align="left">2008.9-2011.09：深圳机场集团扩建工程指挥部，规划设计部，规划设计经理</div>

var text = p.text().trim();

var regexp = new RegExp("^" + sectiontext);

if(text.match(regexp)) {

topend = true;

// we reach the top of section, has to stop here

break;

}

// search multiple lines format:

// 2013/07 -- 2013/10:<company name>|

// 2013/07 -- 2013/10 ...

// some other horizontal layout:

// horizon layout is not coded as section, but as such:

// <workexp head 1>

// <basic inform 1>

// <basic inform 11>

// <workexp job text>

// <workexp head 2>

// <basic inform 2>

// <basic inform 22>

// <worexp job text>

// som other non-horizontal layout:

// <div align="left"><b>职业经历</b>

// <b> </b>

// <br>

// 2011.09-至今：西部机场集团规划发展部，机场规划与土地管理主管

// </div>

// <div align="left">2008.9-2011.09：深圳机场集团扩建工程指挥部，规划设计部，规划设计经理</div>

// FIXME: need to parse uncommon format

if(!horizonlayout) {

if(topend) {

// p itself

var regexp = new RegExp("^" + sectiontext);

var text = p.text().replace(regexp, '').trim();

// when p's own text carries date ranges, strip the heading text from its
// markup so only the entries remain
if(text.match(/\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)[:]$/g) ||

text.match(/\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)\s+\S+/g)) {

var html = p.html().replace(sectiontext, '');

p.empty().html(html);

}

founds = $('*', p).filter(function() {

var elem = this[0];

if(!elem || elem.type != 'tag') return false;

// text might be a from parent element which contains multiple experience lines,

// but we only need to keep the single element of having date range

if($(this).children().length > 0) return false;

var text = $(this).text().trim().replace(/(\d+)年(\d+)月/g, '$1/$2');

if(text.match(/\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)[:]$/g)) {

if(horizonlayout && !$(this).parent().is(p))

return false;

// a matching direct child of p switches to the horizontal layout
if(!horizonlayout) {

if($(this).parent().is(p)) horizonlayout = true;

}

return true;

}

// save as above but with company together

if(text.match(/\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)\s+\S+/g)) {

if(horizonlayout && !$(this).parent().is(p))

return false;

if(!horizonlayout) {

if($(this).parent().is(p)) horizonlayout = true;

}

return true;

}

});

}

// get multiple full line of work experience

// found is only the list of single text node. We need to track back to its parent node which contains full experience line text

var workexps = [];

if(horizonlayout) {

// when section is in horizontal format, we can easily combine texts

founds.each(function(i, el) {

var c = $(this),

n = null,

m = null,

pattern = null;

c.find('br').each(function() {

$(this).replaceWith('<span> | </span>');

});

// this is the node we need to obtain all text from next all elements

// c is full complete text in format as:

// <tr><td>2013 /12--2014 /2:南京智风多媒体有限公司(50-150人)<span style="color:#676767;"><b> [ 2个月] </b></span></td></tr>

// <tr><td width="22%" class="text_left">所属行业:</td><td width="78%" class="text">计算机服务(系统、数据服务、维修)</td></tr>

// <tr><td class="text_left"><b>软件测试</b></td><td class="text"><b>软件测试实习生</b></td></tr>

// <tr><td colspan="2" class="text_left">主要是测试教育软件，大学视频教学，数字化教学资源、多媒体产品<br>测试工具主要是公司研发的自主软件<br>管理软件是:OA小型公司管理系统</td></tr>

// <tr><td> 2008 /3--2012 /12:天泽信息产业股份有限公司（150-500人） [ 4年9个月]</td></tr>

// ...workexp 2

// f walks the following siblings of n, appending their text to the current
// entry with " | " separators and starting a new entry at each sibling that
// opens with a date range of the remembered pattern
var f = function(n) {

var text = n.text().trim().replace(/(\d+)\s*[\.\/]/g, "$1/");

while(1) {

n = n.next();

// need to see if next element is in found list, if so, it's next workexp

// the next level should not reach more than 5 level down

var nt = n.length <= 0 ? '' : n.text().replace(/(\d+)年(\d+)月/g, '$1/$2').trim();

// end of section

if(n.length <= 0) {

workexps[workexps.length++]= text.replace(sectiontext, '').replace(/(\d+)年(\d+)月/g, '$1/$2').trim();

if(n.length <= 0)

break;

}

// new workexp ?

if(m=nt.match(/(^\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2})\s*(.*?)/g)) {

// it should match date start pattern
// every digit is replaced by 1 and whitespace removed, so only the shape of
// the date prefix is compared (padDate is an external helper; presumably it
// pads the date parts - confirm)

if(stringsim.distance(pattern, resumeutil.padDate(m[0].replace(/\d/g, 1).replace(/\s+/g, ''))) == 0) {

workexps[workexps.length++]= text.replace(sectiontext, '').replace(/(\d+)年(\d+)月/g, '$1/$2').trim();

// good for next workexp

text = nt.replace(/(\d+)\s*[\.\/]/g, "$1/");

continue;

}

// replace br with delimiter

n.find('br').each(function() {

$(this).replaceWith('<span> | </span>');

});

n.children().each(function() {

var v = $(this).text().trim();

if(v.length > 0)

text = text + " | " + v;

})

}

};

$('*', c).each(function() {

if(m = $(this).html().replace(/(\d+)年(\d+)月/g, '$1/$2').trim().match(/(^\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2})\s*(.*?)/g)) {

if(!pattern) {

// remember first date range as pattern. The each following workexp should have same pattern

pattern = resumeutil.padDate(m[0].replace(/\d/g, 1).replace(/\s+/g, ''));

}

else {

// the max text distance is 0: 1111/11--1111/11 => 1111/11--1111/11

if(stringsim.distance(pattern, resumeutil.padDate(m[0].replace(/\d/g, 1).replace(/\s+/g, ''))) > 0) {

// most likely this is not a new workexp

return true;

}

var t = $(this).text().trim();

n = $(this);

// climb while the parent holds exactly the same text, then collect from
// that node onwards
while(1) {

if(n.parent().text().trim() != t) {

f(n);

break;

}

n = n.parent();

}

return false;

}

})

}

else {

// the next each is used to handle non-horizontal layout

founds.each(function(i, el) {

var text = $(this).text().trim();

var p = $(this);

while(1) {

p = p.parent();

var ntext = p.text().trim();

if(ntext != text) {

// replace br with delimiter

p.find('br').each(function() {

$(this).replaceWith('<span> | </span>');

})

// replace any tag with delimiter to correct build text

if(p.children().length > 0) {

$('*', p).each(function() {

var elem = this[0];

// NOTE(review): this compares the children() result itself with 0; probably
// `.children().length` was meant - confirm
if($(this).children() <= 0) {

if($(this).text().length > 0)

$(this).html($(this).html() + " |");

else {

$(this).replaceWith(' | ');

}

})

}

workexps[workexps.length++]= p.text().replace(sectiontext, '').trim();

break;

}

});

}

parse_data.workexp = [];

// parse each distinct line; entries parseWorkexp_v3 rejects (falsy) are dropped
us.unique(workexps).forEach(function(v) {

var w = self.parseWorkexp_v3(v);

if(w)

parse_data.workexp[parse_data.workexp.length++] = w;

});

this.updateWorkexp(parse_data);

}

// Parses one flattened work-experience line ('|'-separated) into a record.
//
// workexp - text such as
//     2013/12--2014/02:南京智风多媒体有限公司(50-150人) [ 2个月] | 所属行业:计算机服务 | 软件测试 | 软件测试实习生 | 主要是测试教育软件
//
// Returns an object that may carry job_year_start, job_year_end, job_months,
// job_company, job_compsize, job_comptype, job_spec, job_depart, job_title,
// job_salary and job_text, or null when the input is empty or its first
// segment does not start with a date range.
Parser51Job.prototype.parseWorkexp_v3 = function(workexp) {
    var m, tmp = {};

    // nothing to parse (previously this blew up on `undefined.trim()`)
    if(typeof workexp !== 'string' || workexp.trim().length <= 0)
        return null;

    // get out known things if they are present
    // some resumes have these
    if(m = workexp.match(/企业性质:(.*?)\s+/)) {
        tmp.job_comptype = m[1].trim().replace(/^\|/, '').trim();
        workexp = workexp.replace(m[0], '');
    }

    if(m = workexp.match(/工作描述:(.*?)$/)) {
        tmp.job_text = m[1].trim().replace(/^\|/, '').trim();
        workexp = workexp.replace(m[0], '');
    }

    workexp = us.compact(workexp.trim().split('|'));

    if(workexp.length <= 0)
        return null;

    // the first segment must open with "<start> - <end>" followed by ':' or a space
    var exp = workexp.shift().trim();

    m = exp.match(/(\d{4}\s*[/\.]\d{1,2}\s*[-]{1,2}\s*(.*?))[:\s]/);

    // invalid workexp return null
    if(!m)
        return null;

    // "2013/2" / "2013.2" -> "2013-02"
    var normalizeDate = function(s) {
        return (s || '').trim().replace(/\//g, '-').replace(/\./g, '-').replace(/\b(\w)\b/g, '0$1');
    };

    var job_years = m[1].trim().replace('--', '-').split('-');

    tmp.job_year_start = normalizeDate(job_years.shift());
    tmp.job_year_end = normalizeDate(job_years.shift());

    // an end that does not start with a digit (e.g. "至今") means the job is
    // ongoing, so it is measured up to now
    var start_date = new Date(tmp.job_year_start);
    var end_date = /^\d/.test(tmp.job_year_end) ? new Date(tmp.job_year_end) : new Date();
    var diff_months = (end_date - start_date) / 1000 / 60 / 60 / 24 / 30;

    // an unparsable date gives NaN, which falls through to 0
    tmp.job_months = diff_months > 0 ? Math.round(diff_months) : 0;

    // 南京智风多媒体有限公司(50-150人) [ 2个月]
    var head = exp.replace(m[0], '').replace(/\[(.*)?\]$/g, '');

    // find out the company size; both ASCII and full-width parentheses occur
    m = head.match(/[\(（]([少于|\d+\-\d+](.*)?)[）\)]/);

    if(m) {
        head = head.replace(m[0], '');
        tmp.job_compsize = m[1].trim();
    }

    var parts = us.compact(head.split(' '));

    if(parts.length > 0)
        tmp.job_company = parts.shift().trim();

    // most below are not valid for 51job format
    if(parts.length > 0)
        tmp.job_title = parts.shift().trim();

    if(parts.length > 0)
        tmp.job_spec = parts.shift().trim();

    if(workexp.length <= 0)
        return tmp;

    // if no company was found next to the date range, the first remaining
    // segment must be the company
    if(!tmp.job_company)
        tmp.job_company = workexp.shift().trim();

    // department may not be given, thus we need to guess it could be
    // example:
    // 所属行业:计算机服务(系统、数据服务、维修) | 软件测试软件测试实习生 | 主要是测试教育软件
    var comptypes = ['外资(欧美)', '外资(非欧美)', '合资(欧美)', '合资(非欧美)', '国企', '民营公司', '外企代表处', '政府机关', '事业单位', '非盈利机构', '其它性质'];

    // NOTE(review): the previous code also had a separate
    // `exp == '其它性质' && values.length >= 3` check after the list lookup; it
    // was unreachable because the list already contains '其它性质'. If a bare
    // '其它' was meant there, it has to be added deliberately.
    var values = [];

    while(workexp.length > 0) {
        var item = workexp.shift().trim();

        if(item.length <= 0) continue;

        if(m = item.match(/^所属行业[:](.*)?/)) {
            // the value is either on the same segment or in the next one
            if(m[1] && m[1].trim().length > 0)
                tmp.job_spec = m[1].trim();
            else
                tmp.job_spec = workexp.length > 0 ? workexp.shift().trim() : '';

            continue;
        }

        if(m = item.match(/^工作职位[:](.*)?/)) {
            if(m[1] && m[1].trim().length > 0)
                tmp.job_title = m[1].trim();
            else
                tmp.job_title = workexp.length > 0 ? workexp.shift().trim() : '';

            continue;
        }

        if(item.match(/^规模/g)) {
            tmp.job_compsize = item.replace(/规模:/g, '');
            continue;
        }

        // a list of company types
        if(comptypes.indexOf(item) != -1) {
            tmp.job_comptype = item;
            continue;
        }

        if(m = item.match(/^(\d+)(.*?)\/月/)) {
            tmp.job_salary = parseInt(m[1], 10);

            // we break here because this is last basic information
            // only exception: when job_text is already matched, this is not last one
            if(!tmp.job_text)
                break;
            else
                continue;
        }

        // everything else will be in valid values for further check
        values.push(item);

        // in common case, the job text has very long strings, we stop here if next value is a bit long
        // the salary string max length could is 15
        if(values.length > 3 && workexp.length > 0 && us.first(workexp).trim().length > 15) {
            break;
        }
    }

    // remove all empty items
    values = us.compact(values);

    // values should not be more than 2 items (job depart, title)
    // if there are more than 3, we need to move it back to workexp
    // workexp should be at least 1
    if(values.length > 2 || workexp.length <= 0) {
        if(workexp.length <= 0 && !tmp.job_text)
            workexp = values.splice(values.length - 1, values.length);
        else
            workexp = us.union(values.splice(2, values.length), workexp);
    }

    // values could be as in example:
    // 软件测试 | 软件测试实习生 | 主要是测试教育软件
    if(values.length > 0 && !tmp.job_spec)
        tmp.job_spec = values.shift();

    if(values.length > 0)
        tmp.job_depart = values.shift();

    if(values.length > 0)
        tmp.job_title = values.shift();

    // everything else is part of job text; join both leftovers as one list so
    // that the last value and the first remaining segment are not glued
    // together without a line break
    tmp.job_text = (tmp.job_text ? tmp.job_text + '\n' : '') + values.concat(workexp).join('\n').trim();

    return tmp;
};

// Derives the "latest job" summary fields from parse_data.workexp.
//
// The entry with the most recent job_year_start (entries without a
// job_company are ignored) supplies latest_job_spec, latest_job_title and
// latest_exp (its duration, formatted as "N个月", "N年" or "N年M个月").
// latest_company_name is only filled in when it was empty on entry, so a name
// already taken from the page is kept.
Parser51Job.prototype.updateWorkexp = function(parse_data) {
    var newest_start, latest_exp, start_date, end_date, diff_months;

    // Decide once, before the loop: testing the field inside the loop made the
    // name stick to the first entry seen while spec/title moved on to newer ones.
    var fill_company = !parse_data.latest_company_name;

    parse_data.workexp.forEach(function(exp) {
        if(!exp.job_company)
            return;

        if(newest_start && !(new Date(exp.job_year_start) > new Date(newest_start)))
            return;

        if(fill_company) {
            // drop a company-size suffix such as "(50-150人)"; the parentheses
            // must be escaped, a pattern starting with `$` never matches
            parse_data.latest_company_name = exp.job_company.replace(/\((\d+\-\d+(.*)?)\)/g, '').trim();
        }

        parse_data.latest_job_spec = exp.job_spec;
        parse_data.latest_job_title = exp.job_title;

        start_date = new Date(exp.job_year_start);

        // an end that does not start with a digit (e.g. "至今"), or one that
        // lies before the start, is treated as "still ongoing"
        if(/^\d/.test(exp.job_year_end || '')) {
            end_date = new Date(exp.job_year_end);

            if(end_date < start_date)
                end_date = new Date();
        }
        else {
            end_date = new Date();
        }

        diff_months = (end_date - start_date) / 1000 / 60 / 60 / 24 / 30;
        diff_months = diff_months > 0 ? Math.round(diff_months) : 0;

        if(diff_months <= 0) {
            latest_exp = "";
        }
        else if(diff_months < 12) {
            latest_exp = diff_months + "个月";
        }
        else if(diff_months % 12 == 0) {
            latest_exp = (diff_months / 12) + "年";
        }
        else {
            latest_exp = Math.floor(diff_months / 12) + "年" + (diff_months % 12) + "个月";
        }

        parse_data.latest_exp = latest_exp;

        newest_start = exp.job_year_start;
    });
};

/**
 * Locates the education section (教育经历 / 教育背景) in the resume DOM,
 * extracts one text line per education entry and stores the parsed entries in
 * parse_data.education, then calls updateEducation() to fill the summary
 * fields.
 *
 * Lines look like:
 *   2010/09 -- 2013/06：南京化工职业技术学院 | 信息科学技术 | 大专
 *   2010/09 -- 至今：山西大学 | 信息与计算科学 | 硕士
 *   2006/09 -- 2010/07：长治学院 | 数学与应用数学 | 本科
 *
 * When no section heading is found the function returns without touching
 * parse_data.
 *
 * NOTE(review): the callbacks read `this[0]`, i.e. they expect `this` to be a
 * cheerio-wrapped element inside filter()/each() callbacks; confirm against
 * the cheerio version this project pins.
 *
 * @param {Function} $ cheerio instance loaded with the resume html
 * @param {Object} parse_data parse result, receives `education`
 */
Parser51Job.prototype.getEducation_v3 = function($, parse_data) {
    var self = this;

    // --- 1. find the heading element ---------------------------------------
    // found2 is the fallback: a content node (possibly a text node) whose own
    // text is exactly the heading, recorded while scanning tag elements
    var found2 = [];
    var found = $('*').filter(function() {
        var elem = this[0];
        if(!elem || elem.type != 'tag') return false;

        if($(this).text().trim().match(/^(教育经历|教育背景)$/g))
            return true;

        // otherwise search contents
        var sub = $(this).contents().filter(function() {
            if($(this).text().trim().match(/^(教育经历|教育背景)$/g))
                return true;
        }).last();

        // use sub found2
        if(sub.length > 0)
            found2 = sub;
    }).last();

    if(found.length <= 0) {
        if(found2.length <= 0)
            return;
        found = found2;
    }

    // --- 2. find the section by walking up from the heading -----------------
    // climb until the ancestor contains more text than the heading itself
    var sectiontext = found.text().trim();
    var p = found;
    while(1) {
        p = p.parent();
        var text = p.text().trim();
        if(text != sectiontext)
            break;
    }

    var horizonlayout = false;
    var founds = [];

    // if p.text is not starting with 教育经历, it means we are going up too far
    // NOTE(review): a 教育背景 heading never satisfies this test, so such
    // resumes always take the sibling walk below; confirm this is intended.
    if(!p.text().trim().match(/^教育经历/)) {
        p = found;
        while(1) {
            if(p.next().length <= 0) {
                // no following sibling: retry one level up, stop at the root
                if(p.parent().length <= 0)
                    break;
                p = p.parent();
                continue;
            }

            var n = p.next();
            if(n && n.text().trim().length > 0) {
                p = n;
                break;
            }

            // some layout has such format
            // <html>
            // <body>
            // <table>...教育经历..</table>
            // <table>...edu 1</table>
            // ...
            // </body>
            // </html>
            // OR
            // <table>
            // <tr><td>2011 /9--2014 /4</td><td>南京理工大学</td><td>计算机技术</td><td>硕士</td><td>some text</td></tr>
            // <tr>edu 2 ...</td>
            // so we check if next item starting with date range
            var insection = null;
            founds = n.nextAll().filter(function() {
                var v = $(this).text().trim();
                // FIXME: somehow the pure text field won't be selected, so we have to select it specifically
                var trailing = (this[0].next && this[0].next.type == 'text') ? this[0].next.data.trim() : '';

                // we only add the items in the section; once a sibling without
                // a date range was seen (insection === false) the section is over
                if((v.length <= 0 && trailing.length <= 0) || insection === false)
                    return false;

                if(v.match(/\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)/g)) {
                    insection = true;
                    return true;
                }
                insection = false;
                return false;
            });

            if(founds.length > 0) {
                p = n;
                horizonlayout = true;
                break;
            }
            p = p.parent();
        }
    }

    // when we reach here, p could be html node when 教育经历 parent node is html
    // <h3 class="fc6699cc">教育经历</h3> <div class="resume-preview-dl"> 2011.09 - 至今河海大学电子信息科学与技术硕士<br> 2007.09 - 2011.06 黄山学院电子信息科学与技术本科<br> </div>
    // <tbody><tr><td>2009/09 -- 至今：南京理工大学紫金学院 | 计算机科学与技术 | 本科</td></tr></tbody>
    if(!horizonlayout) {
        // pp records the parent when entering first education
        var pp = false;
        var inpos = true;

        // shared acceptance rule for an element whose text starts with a date range
        var acceptDated = function(node) {
            if(horizonlayout && !node.parent().is(p))
                return false;
            if(!horizonlayout) {
                if(node.parent().is(p)) horizonlayout = true;
            }
            pp = node.parent();
            return true;
        };

        founds = $('*', p).filter(function() {
            var elem = this[0];
            if(!elem || elem.type != 'tag') return false;

            var node = $(this);

            // text might be a from parent element which contains multiple experience lines,
            // but we only need to keep the single element of having date range
            if(node.children().length <= 0) return false;

            var text = node.text().trim();

            // once the common pp is defined, if text only line shows up
            // we are not in education section anymore
            // if the element doesn't share same parent, ignore it too
            // NOTE(review): /\D+/ is true for any text holding a non digit,
            // so this stops at the first non empty element after the first
            // hit; verify that the pattern is what was intended.
            if(pp && (!node.parent().is(pp) || (text.length > 0 && /\D+/.test(text)))) {
                inpos = false;
            }
            if(!inpos)
                return false;

            // date range followed by ':'
            if(text.match(/^\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)[:]/g))
                return acceptDated(node);

            // same as above but with school together
            if(text.match(/^\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2}\s*(.*?)\s*\S+/g))
                return acceptDated(node);

            return false;
        });
    }

    // --- 3. turn every found element into one or more education lines -------
    // pattern remembers the shape of the first date range (digits replaced by
    // 1); m is scratch storage for the regex match inside the filter below
    var educations = [], pattern = null, m = null;

    founds.each(function(i, el) {
        // replace delimiter
        $(this).find('br').each(function() {
            $(this).replaceWith('@@@@');
        });

        // replace any tag with delimiter to correct build text
        var container = this;
        if($(this).children().length > 0) {
            $('*', container).each(function() {
                // only leaf elements get a delimiter
                if($(this).children().length <= 0) {
                    if($(this).text().length > 0)
                        $(this).html($(this).html() + " |");
                    else {
                        $(this).replaceWith(' | ');
                    }
                }
            });
        }

        // normalise "2011 .9" / "2011 /9" to "2011/9", then cut at the <br> markers
        var values = us.compact($(this).text().replace(/(\d+)\s*[\.\/]/g, "$1/").trim().split(/@@@@/g));

        // values might have multiple education for horizontal layout
        var nv = [];
        values.forEach(function(v) {
            var tokens = us.compact(v.split(/(\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2})/));
            // if more than one education presents, tokens is larger than 2
            // '2011/9--', '2014/4 | 南京理工大学 | 计算机技术 | 硕士
            // '2007/9--', '2011/6 | 江苏理工学院 | 计算机科学与技术 | 本科
            while(tokens.length > 0) {
                var head = tokens.shift();
                // an odd token count must not glue the text "undefined" onto a line
                var tail = tokens.length > 0 ? tokens.shift() : '';
                nv.push(head + tail);
            }
        });
        values = nv;

        // the next value could be another education, otherwise ignore it
        values = values.filter(function(v) {
            if(!(m = v.trim().match(/^(\d{4}\s*[\/\.]\d{1,2}\s*[-]{1,2})/)))
                return false;

            var shape = resumeutil.padDate(m[0].replace(/\d/g, '1').replace(/\s+/g, ''));
            if(!pattern) {
                // remember first date range as pattern. Each following line should have same pattern
                pattern = shape;
            }
            else {
                // the max text distance is 0: 1111/11--1111/11 => 1111/11--1111/11
                if(stringsim.distance(pattern, shape) > 0) {
                    // most likely this is not a new education line
                    return false;
                }
            }
            return true;
        });

        if(values.length <= 0)
            return;

        values = values.map(function(v) { return v.trim(); });
        educations = us.union(educations, values);
    });

    // --- 4. parse the collected lines ---------------------------------------
    parse_data.education = [];
    us.unique(us.compact(educations)).forEach(function(v) {
        var e = self.parseEducation_v3(v.trim());
        if(e)
            parse_data.education.push(e);
    });

    this.updateEducation(parse_data);
};

/**
 * Parses one education line into a structured record.
 *
 * Accepted shapes (fields separated by '|', or by a single space when the
 * line has no '|'):
 *   2010/09 -- 2013/06：南京化工职业技术学院 | 信息科学技术 | 大专
 *   2010/09-2013/06:南京化工职业技术学院 | 信息科学技术 | 大专
 *   2010.09 - 至今 | 山西大学 | 信息与计算科学 | 硕士
 *
 * @param {string} education raw education line
 * @returns {Object|null} object with school_year_start, school_year_end,
 *          school_name, major_name and degree_level (dates as "YYYY-MM"; a
 *          non numeric end such as 至今 is kept as is), or null when the line
 *          has fewer than two fields or no plausible degree level.
 */
Parser51Job.prototype.parseEducation_v3 = function(education) {
    if(typeof education !== 'string')
        return null;

    // "2010/9" or "2010.09" => "2010-09" (single digit tokens get zero padded)
    function toDashedDate(text) {
        return text.replace(/[\/\.]/g, '-').replace(/\b(\w)\b/g, '0$1');
    }

    // make sure 2010.09 - 2014.07 => 2010.09-2014.07 or 2010/09 -- 2013/06: => 2010/09-2013/06:
    var parts = education.replace(/\s*\-+\s*/g, '-').split('|');
    if(parts.length <= 1) {
        // no '|' delimiter at all: fall back to space separated fields
        parts = parts.shift().split(' ');
    }
    parts = parts.filter(Boolean);
    if(parts.length <= 1)
        return null;

    var tmp = {};
    var edu = parts.shift().trim();

    // example:
    // 2010/09-2013/06:南京化工职业技术学院
    // 2010.09-2014.07
    var m = edu.match(/\d{4}[\/\.]\d{1,2}\-(.*)?/);
    if(m) {
        // The date range is followed by ':' (ASCII or full-width '：') or by
        // whitespace. Empty tokens, e.g. from ": ", are dropped so that the
        // following fields do not shift by one.
        var values = m[0].split(/[:：\s]/).filter(Boolean);
        var school_years = values.shift().trim().split('-');
        tmp.school_year_start = toDashedDate(school_years.shift());
        tmp.school_year_end = toDashedDate(school_years.shift());

        if(values.length > 0)
            tmp.school_name = values.shift().trim();
        if(values.length > 0)
            tmp.major_name = values.shift().trim();
        if(values.length > 0)
            tmp.degree_level = values.shift().trim();

        // if first doesn't have school name, school name should be second
        if(!tmp.school_name || tmp.school_name.length <= 0)
            tmp.school_name = parts.shift().trim();
    }

    if(parts.length > 0 && !tmp.major_name)
        tmp.major_name = parts.shift().trim();
    if(parts.length > 0 && !tmp.degree_level)
        tmp.degree_level = parts.shift().trim();

    // degree level should not be a large string otherwise we are on wrong data
    if(!tmp.degree_level || tmp.degree_level.length > 5)
        return null;

    // ignore major name if degree level is high school
    if(tmp.degree_level == '高中')
        tmp.major_name = '';

    return tmp;
};

/**
 * Copies the education entry with the latest school_year_start into the
 * highest_degree_level / highest_major_name / highest_school_name fields of
 * parse_data. Empty slots in parse_data.education are skipped.
 *
 * @param {Object} parse_data parse result holding an `education` array
 */
Parser51Job.prototype.updateEducation = function(parse_data) {
    // start date of the entry currently reflected in the highest_* fields
    var newestStart = 0;
    var entries = parse_data.education;

    for (var idx = 0; idx < entries.length; idx++) {
        var entry = entries[idx];
        if (!entry)
            continue;

        // the first usable entry always wins, later ones only when they start later
        var startsLater = !newestStart ||
            (new Date(entry.school_year_start) > new Date(newestStart));
        if (!startsLater)
            continue;

        parse_data.highest_degree_level = entry.degree_level;
        parse_data.highest_major_name = entry.major_name;
        parse_data.highest_school_name = entry.school_name;
        newestStart = entry.school_year_start;
    }
};

相关推荐