# News website article extraction module

import json
import requests
from gne import GeneralNewsExtractor

# Extract the main body of a news article using GNE (GeneralNewsExtractor).
# Install the extractor with: pip install gne

# Sample Baidu news landing page used as the extraction target.
url = 'https://mbd.baidu.com/newspage/data/landingsuper?context=%7B%22nid%22%3A%22news_10032545455944181533%22%7D&n_type=0&p_from=1'
# Browser-like User-Agent sent along with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}

# Send the request with the headers defined above, and bound the wait with a
# timeout so an unresponsive server cannot block the script indefinitely.
response = requests.get(url, headers=headers, timeout=10)
extractor = GeneralNewsExtractor()  # create the extractor instance
# Decode the raw response bytes (UTF-8, the default for bytes.decode) and hand
# the resulting HTML text to the extractor.
# NOTE(review): with_body_html=True presumably adds the body's HTML to the
# result — confirm against the GNE documentation.
result = extractor.extract(response.content.decode(), with_body_html=True)
# ensure_ascii=False keeps non-ASCII characters unescaped in the printed JSON.
print(json.dumps(result, ensure_ascii=False))