你回来了?
我们一直在隔壁!

Python 爬取网易、新浪、百度等网站热门新闻并发布到 WordPress

# -*- coding:utf-8 -*-

import requests
import re
import json
from wp_xmlrpc import Client, WordPressPost
from wp_xmlrpc.methods.posts import NewPost

# 爬取网易新闻
def get_news_163(timeout=10):
    """Yield headline entries scraped from the NetEase (163.com) news front page.

    Args:
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Yields:
        dict: ``{'title': ..., 'url': ...}`` for every ``titleBar`` block that
        has a link. ``title`` is the raw inner markup of the block (it may
        still contain HTML tags); ``url`` is the first ``href`` that follows
        the block's opening tag.
    """
    page_url = 'http://news.163.com/'
    # A timeout keeps an unresponsive server from blocking the script forever.
    response = requests.get(page_url, timeout=timeout)
    if 'charset' not in response.headers.get('Content-Type', '').lower():
        # With no declared charset, requests decodes text responses as
        # ISO-8859-1, which garbles Chinese; detect the encoding from the body.
        response.encoding = response.apparent_encoding
    html = response.text

    block_re = re.compile(r'<div class="titleBar" id=".*?">(.*?)</div>', re.S)
    href_re = re.compile(r'<a href="(.*?)"', re.S)

    # Walk the blocks once and look up each block's own link, instead of
    # zipping two independent match lists that can drift out of step.
    blocks = list(block_re.finditer(html))
    for index, block in enumerate(blocks):
        following = index + 1
        # Never search past the start of the next block, so a block without
        # a link cannot pick up its neighbour's URL.
        limit = blocks[following].start() if following < len(blocks) else len(html)
        link = href_re.search(html, block.start(1), limit)
        if link is None:
            continue
        yield {
            'title': block.group(1),
            'url': link.group(1),
        }

# 爬取新浪新闻
def get_news_sina(timeout=10):
    """Yield headline entries scraped from the Sina news front page.

    Args:
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Yields:
        dict: ``{'title': ..., 'url': ...}`` for every ``<h2>`` headline link
        that points at ``https://news.sina.com.cn/`` and opens in a new tab.
    """
    page_url = 'https://news.sina.com.cn/'
    # A timeout keeps an unresponsive server from blocking the script forever.
    response = requests.get(page_url, timeout=timeout)
    if 'charset' not in response.headers.get('Content-Type', '').lower():
        # With no declared charset, requests decodes text responses as
        # ISO-8859-1, which garbles Chinese; detect the encoding from the body.
        response.encoding = response.apparent_encoding
    html = response.text

    # One pattern captures the link and its text together, so a title can
    # never be paired with another headline's URL. [^"]* keeps the href
    # capture inside a single attribute value.
    headline_re = re.compile(
        r'<h2><a href="(https://news.sina.com.cn/[^"]*)" target="_blank">(.*?)</a></h2>',
        re.S,
    )
    for link, title in headline_re.findall(html):
        yield {
            'title': title,
            'url': link,
        }

# 爬取百度新闻
def get_news_baidu(timeout=10):
    """Yield headline entries scraped from the Baidu news front page.

    Args:
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Yields:
        dict: ``{'title': ..., 'url': ...}`` for every anchor carrying the
        ``mon="ct=1&a=2&c=top&pn=0&"`` attribute that the pattern below keys on.
    """
    page_url = 'http://news.baidu.com/'
    # A timeout keeps an unresponsive server from blocking the script forever.
    response = requests.get(page_url, timeout=timeout)
    if 'charset' not in response.headers.get('Content-Type', '').lower():
        # With no declared charset, requests decodes text responses as
        # ISO-8859-1, which garbles Chinese; detect the encoding from the body.
        response.encoding = response.apparent_encoding
    html = response.text

    # One pattern captures the link and its text together, so a title can
    # never be paired with another headline's URL. [^"]* keeps the href
    # capture inside a single attribute value.
    headline_re = re.compile(
        r'<a href="([^"]*)" target="_blank" mon="ct=1&a=2&c=top&pn=0&">(.*?)</a>',
        re.S,
    )
    for link, title in headline_re.findall(html):
        yield {
            'title': title,
            'url': link,
        }

# 将新闻发布到wordpress
def post_news_wordpress(news, client=None):
    """Publish one scraped news entry as a WordPress post over XML-RPC.

    Args:
        news: Mapping with a ``'title'`` and a ``'url'`` key, as yielded by
            the ``get_news_*`` generators.
        client: Optional, already constructed ``Client`` to send the post
            through, so callers can reuse one connection and supply real
            credentials. When omitted, a client is created with the
            placeholder endpoint and account below.

    Raises:
        KeyError: If ``news`` lacks ``'title'`` or ``'url'``.
    """
    if client is None:
        # Placeholder values: replace with the site's real XML-RPC endpoint
        # and account, or pass a configured client instead.
        client = Client('http://example.com/xmlrpc.php', 'username', 'password')
    post = WordPressPost()
    post.title = news['title']
    # The post body is a single link back to the source article.
    post.content = '<a href="{}">{}</a>'.format(news['url'], news['title'])
    post.post_status = 'publish'
    client.call(NewPost(post))

if __name__ == '__main__':
    # Scrape each portal in turn and publish every entry as soon as it is yielded.
    for fetch_news in (get_news_163, get_news_sina, get_news_baidu):
        for entry in fetch_news():
            post_news_wordpress(entry)
赞(0)
版权声明:本文采用知识共享 署名-非商业性使用-相同方式共享 4.0 国际许可协议 [BY-NC-SA] 进行授权
文章名称:《Python 爬取网易、新浪、百度等网站热门新闻并发布到 WordPress》
文章链接:https://www.gebizhan.com/1718.html
本站资源仅供个人学习交流,请于下载后24小时内删除,不允许用于商业用途,否则法律问题自行承担。

隔壁评论 抢沙发

快来看啊,隔壁站!

我们就是隔壁站的老王

隔壁邮箱隔壁TG

登录

找回密码

注册