爬取今日头条文章

本教程仅供技术研究与学习使用,若有侵权,请联系本人删除。

以 https://www.toutiao.com/c/user/59672551416/#mid=1566273643580418  为例

1: 破解as、cp

使用万能的  alt+F  

爬取今日头条文章

将js代码改写为python代码

#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Author: zhibo.wang
# E-mail: gm.zhibo.wang@gmail.com
# Date  : 20/07/06 11:36:11
# Desc  :

"""

https://s3.pstatp.com/toutiao/resource/ntoutiao_web/static/js/common/lib_6b19209.js
i = window.byted_acrawler && window.byted_acrawler.sign ? window.byted_acrawler.sign(o) : ""

"""

import hashlib
import math
import re
import time


def get_as_cp():
    """Build the time-based ``as`` / ``cp`` request parameters.

    Python port of the ``getHoney`` routine found in
    https://s3.pstatp.com/toutiao/resource/ntoutiao_web/page/profile/index_ae91792.js

    The current Unix time (whole seconds) is rendered as upper-case hex, and
    the MD5 of its decimal string as upper-case hex. The two values are then
    interleaved:

    * ``as`` = ``"A1"`` + (first 5 digest chars interleaved with the first 5
      hex-timestamp chars, digest char first) + last 3 hex-timestamp chars
    * ``cp`` = first 3 hex-timestamp chars + (hex-timestamp chars 3..7
      interleaved with the last 5 digest chars, timestamp char first) + ``"E1"``

    If the hex timestamp is not exactly 8 characters long, a fixed fallback
    pair is returned instead.

    Returns:
        tuple: ``(AS, CP)`` as two strings.
    """
    now = int(math.floor(time.time()))
    stamp_hex = hex(now)[2:].upper()
    digest = hashlib.md5(str(now).encode(encoding='utf-8')).hexdigest().upper()

    # Same hard-coded fallback the JavaScript uses when the timestamp does
    # not render as exactly eight hex digits.
    if len(stamp_hex) != 8:
        return '479BB4B7254C150', '7E0AC8874BB0985'

    as_body = ''.join(d + h for d, h in zip(digest[:5], stamp_hex[:5]))
    cp_body = ''.join(h + d for h, d in zip(stamp_hex[3:8], digest[-5:]))

    return 'A1' + as_body + stamp_hex[-3:], stamp_hex[:3] + cp_body + 'E1'

max_behot_time 参数:请求第一页时传 0;请求后续页时,传上一次响应数据中返回的 max_behot_time 值。

爬取今日头条文章

 

_signature参数 也是最难处理的

全局搜索,打断点找到 生成的文件

爬取今日头条文章

 

 

 

https://s3.pstatp.com/toutiao/resource/ntoutiao_web/static/js/common/lib_6b19209.js
i = window.byted_acrawler && window.byted_acrawler.sign ? window.byted_acrawler.sign(o) : "";

 

爬取今日头条文章

跟着断点一直走发现最终生成的文件是 VM621 也就是下面这张截图

爬取今日头条文章

 

将此文件内容全部拷贝,粘贴到下面 sign.js 中注释标注的位置:

// sign.js -- part 1: fake just enough of a browser environment for the
// copied signing script to run under Node.js.
//
// Usage: node sign.js <url> <cookies> <user-agent>
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);

// Expose Node's global object under the name `window`, which the copied
// browser script reads from.
global.window = global;

// Command-line inputs. Declared with `let` so they stay reassignable, e.g.
// to hard-code a value while debugging.
let baseurl = process.argv[2];
let cookies = process.argv[3];
let ua = process.argv[4];

var document = dom.window.document;

// Browser properties copied onto `window` below.
var params = {
    // NOTE(review): `hash` does not match the fragment in `href`, and
    // `pathname` contains a "#..." fragment. Both look like copy/paste slips;
    // left unchanged because it is unknown whether the signing script reads
    // them -- confirm against a real browser session.
    location:{
        hash: "#mid=5954781019",
        host: "www.toutiao.com",
        hostname: "www.toutiao.com",
        href: "https://www.toutiao.com/c/user/59672551416/#mid=1566273643580418",
        origin: "https://www.toutiao.com",
        pathname: "/c/user/59672551416/#mid=1566273643580418",
        port: "",
        protocol: "https:",
        search: "",
    },
    navigator:{
        appCodeName: "Mozilla",
        appName: "Netscape",
        appVersion: "5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
        cookieEnabled: true,
        deviceMemory: 8,
        doNotTrack: null,
        hardwareConcurrency: 4,
        language: "zh-CN",
        languages: ["zh-CN", "zh"],
        maxTouchPoints: 0,
        onLine: true,
        platform: "Win32",
        product: "Gecko",
        productSub: "20030107",
        // The user agent comes from the command line.
        userAgent:  ua,
        vendor: "Google Inc.",
        vendorSub: "",
    },
    "screen":{
        availHeight: 1040,
        availLeft: 0,
        availTop: 0,
        availWidth: 1920,
        colorDepth: 24,
        height: 1080,
        pixelDepth: 24,
        // The property name was missing here (a bare `1920,`), which is a
        // syntax error; `width` matches `availWidth` above.
        width: 1920,
    }
};
Object.assign(window,params);


/**
 * Store one cookie on the global `document`.
 *
 * @param {string} name - Cookie name.
 * @param {string} value - Cookie value; passed through `escape()` before storing.
 * @param {number} [seconds] - Lifetime in seconds. When omitted or falsy no
 *   `expires` attribute is written.
 */
function setCookie(name, value, seconds) {
    const ttlSeconds = seconds || 0;
    let expiresAttr = "";
    if (ttlSeconds != 0) {
        // Expiry = now + lifetime, rendered in the GMT date format.
        const expiry = new Date(Date.now() + (ttlSeconds * 1000));
        expiresAttr = "; expires=" + expiry.toUTCString();
    }
    document.cookie = name + "=" + escape(value) + expiresAttr + "; path=/";
}

// sign.js -- part 2: load the cookies into the fake document, run the copied
// signing script, and print the signature for `baseurl` on stdout.

// Example of the cookie string expected as the second command-line argument:
//cookies = "csrftoken=a6f078a275e9f39b0addfb9df37fd890; tt_webid=6856639657595241992; s_v_web_id=verify_kde4odjw_isOUm41W_VbRS_4WS0_BrtZ_Ch1KLo5pkNV5;tasessionId=ownu4mas91596435834562; ttcid=1de8f696daab43dc8eb818a02408bd6930; tt_scid=P.PuhA.5OslBUeRVIYUAYFS--vw9l9LTWpc4-b4r7prsBwQ2X6extVf1PCjkhCNWc102"
for (const cookie of cookies.split(";")) {
    // Pairs are usually separated by "; ", so strip the surrounding
    // whitespace; skip empty segments such as the one after a trailing ";".
    const pair = cookie.trim();
    if (pair === "") {
        continue;
    }
    // Split on the FIRST "=" only, so values that contain "=" stay intact.
    const eq = pair.indexOf("=");
    const name = eq === -1 ? pair : pair.slice(0, eq);
    const value = eq === -1 ? "" : pair.slice(eq + 1);
    setCookie(name, value, 1800);
}
window.document = document;


// Paste the copied script content here


window.byted_acrawler && window.byted_acrawler.init({
    aid: 24,
    dfp: true,
  });


// Fail with a clear message instead of a bare TypeError when the copied
// script was not pasted above (or did not define the signer).
if (!window.byted_acrawler || typeof window.byted_acrawler.sign !== "function") {
    throw new Error("window.byted_acrawler.sign is not available; paste the copied signing script into sign.js first");
}

// Example with a hard-coded URL:
//sign = window.byted_acrawler.sign({url:"https://www.toutiao.com/api/pc/media_hot/?media_id=1566273643580418&user_id=59672551416"});
const sign = window.byted_acrawler.sign({url:baseurl});
console.log(sign);

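请先安装 jsdom:npm i -g jsdom(全局安装时需将全局 node_modules 目录加入 NODE_PATH,否则 require("jsdom") 找不到模块;也可以在 sign.js 所在目录执行 npm i jsdom 进行本地安装)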

#!/usr/bin/env python
# -*- coding:utf-8 -*-


import os
import time
import math
import hashlib
import requests

def getHoney():
    """Return the time-derived ``as`` and ``cp`` query parameters.

    The current Unix time in whole seconds is formatted as upper-case hex,
    and the MD5 of its decimal string as upper-case hex. Characters from
    both are interleaved to form the two values. When the hex timestamp is
    not exactly 8 characters long a fixed fallback pair is returned.

    Returns:
        dict: ``{'as': <str>, 'cp': <str>}``.
    """
    seconds = math.floor(time.time())
    hex_ts = str('%X' % seconds)
    digest = str(hashlib.md5(str(seconds).encode('utf-8')).hexdigest()).upper()

    if len(hex_ts) != 8:
        return {'as': "479BB4B7254C150", 'cp': "7E0AC8874BB0985"}

    head = digest[:5]
    tail = digest[-5:]
    # 'as' pairs digest-head chars with the leading timestamp chars;
    # 'cp' pairs timestamp chars 3..7 with the digest-tail chars.
    as_mid = ''.join(head[k] + hex_ts[k] for k in range(5))
    cp_mid = ''.join(hex_ts[k + 3] + tail[k] for k in range(5))

    return {
        'as': "A1" + as_mid + hex_ts[-3:],
        'cp': hex_ts[:3] + cp_mid + "E1",
    }

def get_signature(url, cookies, ua):
    """Run ``node sign.js`` and return the ``_signature`` query suffix.

    The three values are handed to Node as separate argv entries and no
    shell is involved. Interpolating them into a shell command line is not
    safe: the URL contains ``&`` and the cookie / user-agent strings contain
    spaces, ``;`` and parentheses, all of which a shell would interpret.

    Args:
        url: The unsigned request URL to sign.
        cookies: The ``Cookie`` header value, passed to sign.js as one argument.
        ua: The ``User-Agent`` header value.

    Returns:
        str: ``"&_signature=" + <signature>``, with the trailing newline that
        ``console.log`` emits stripped off. The signature part is empty if
        the script printed nothing.

    Raises:
        OSError: if the ``node`` executable cannot be started.
    """
    # Local import so this function does not depend on the file-level imports.
    import subprocess

    completed = subprocess.run(
        ['node', 'sign.js', url, cookies, ua],
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    return "&_signature=" + completed.stdout.strip()

if __name__ == '__main__':
    # Demo driver: build the article-list URL for one user, sign it with
    # sign.js, then fetch it and print the signed URL and the response body.

    # The cookie and user agent are sent with the request and also passed to
    # the signing script, so they are named once here.
    cookie_header = 'csrftoken=a6f078a275e9f39b0addfb9df37fd890; s_v_web_id=verify_kde4odjw_isOUm41W_VbRS_4WS0_BrtZ_Ch1KLo5pkNV5; ttcid=1de8f696daab43dc8eb818a02408bd6930; SLARDAR_WEB_ID=c7f55d5c-4dba-493d-a126-ce8e36b472bf; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6856656984092460551; tt_webid=6856656984092460551; __tasessionId=61hz1rirw1596442527425; tt_scid=UD3a5jP-6nL7yUaAawB2lLtCdtv430T-TJyynultVAGY6J4cY6KXTiH1QRWAYhb9e1f5'
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'

    request_headers = {
        'Referer': 'https://www.toutiao.com/',
        'authority': 'www.toutiao.com',
        'method': 'GET',
        # Placeholder; replaced with the signed path further down.
        'path': '/c/user/59672551416/',
        'scheme': 'https',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': cookie_header,
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': user_agent,
    }

    # Path + query string, with the time-based as/cp values filled in.
    query_path = '/c/user/article/?page_type=1&user_id=59672551416&max_behot_time=0&count=20&as={as}&cp={cp}'.format(**getHoney())
    unsigned_url = 'https://www.toutiao.com/toutiao' + query_path

    signature_suffix = get_signature(
        unsigned_url,
        request_headers["cookie"],
        request_headers["user-agent"]
    )

    request_headers['path'] = query_path + signature_suffix
    signed_url = unsigned_url + signature_suffix
    print(signed_url)

    response = requests.get(url=signed_url, headers=request_headers)
    print(response.text)

python test.py

爬取今日头条文章