# -*- coding:utf-8 -*-
import requests
import re
import json
from wp_xmlrpc import Client, WordPressPost
from wp_xmlrpc.methods.posts import NewPost
# 爬取网易新闻
def get_news_163(html=None, timeout=10):
    """Yield headline entries scraped from the NetEase (163) news front page.

    Every ``<div class="titleBar" id="...">`` block is treated as one
    headline. The URL is the first ``<a href="...">`` found *inside* that
    block and the title is the block's text with markup removed, so a title
    can never be paired with a link that belongs to a different block.

    Args:
        html: Optional page markup to parse. When ``None`` (the default) the
            page is downloaded from ``http://news.163.com/``.
        timeout: Seconds to wait for the HTTP request. Only used when the
            page has to be downloaded.

    Yields:
        dict: ``{'title': str, 'url': str}`` for each block that has both a
        link and a non-empty title. HTML entities in the title are left
        exactly as they appear in the page.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    if html is None:
        response = requests.get('http://news.163.com/', timeout=timeout)
        response.raise_for_status()
        # When the Content-Type header names no charset, requests decodes
        # text as ISO-8859-1, which garbles Chinese pages; fall back to the
        # encoding detected from the body instead.
        if 'charset' not in response.headers.get('Content-Type', '').lower():
            response.encoding = response.apparent_encoding
        html = response.text

    block_re = r'<div class="titleBar" id="[^"]*">(.*?)</div>'
    for block in re.findall(block_re, html, re.S):
        link = re.search(r'<a href="([^"]*)"', block)
        if link is None:
            # A title bar without a link cannot be published as a link post.
            continue
        # Drop tags, then collapse the whitespace/newlines left between them.
        title = ' '.join(re.sub(r'<[^>]+>', '', block).split())
        if not title:
            continue
        yield {'title': title, 'url': link.group(1)}
# 爬取新浪新闻
def get_news_sina(html=None, timeout=10):
    """Yield headline entries scraped from the Sina news front page.

    A headline is an ``<h2>`` whose only content is a link to
    ``https://news.sina.com.cn/...`` opened with ``target="_blank"``. The URL
    and the title are captured by one pattern, so both values always come
    from the same anchor.

    Args:
        html: Optional page markup to parse. When ``None`` (the default) the
            page is downloaded from ``https://news.sina.com.cn/``.
        timeout: Seconds to wait for the HTTP request. Only used when the
            page has to be downloaded.

    Yields:
        dict: ``{'title': str, 'url': str}`` for each matching headline with
        a non-empty title. HTML entities in the title are left exactly as
        they appear in the page.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    if html is None:
        response = requests.get('https://news.sina.com.cn/', timeout=timeout)
        response.raise_for_status()
        # When the Content-Type header names no charset, requests decodes
        # text as ISO-8859-1, which garbles Chinese pages; fall back to the
        # encoding detected from the body instead.
        if 'charset' not in response.headers.get('Content-Type', '').lower():
            response.encoding = response.apparent_encoding
        html = response.text

    # [^"]* keeps the href capture inside its own attribute value even though
    # re.S lets "." cross line boundaries in the title part.
    item_re = (
        r'<h2><a href="(https://news\.sina\.com\.cn/[^"]*)" target="_blank">'
        r'(.*?)</a></h2>'
    )
    for link, raw_title in re.findall(item_re, html, re.S):
        # Drop tags, then collapse the whitespace/newlines left between them.
        title = ' '.join(re.sub(r'<[^>]+>', '', raw_title).split())
        if not title:
            continue
        yield {'title': title, 'url': link}
# 爬取百度新闻
def get_news_baidu(html=None, timeout=10):
    """Yield headline entries scraped from the Baidu news front page.

    A headline is an anchor of the exact form
    ``<a href="..." target="_blank" mon="ct=1&a=2&c=top&pn=0&">``. The URL and
    the title are captured by one pattern, so both values always come from
    the same anchor.

    NOTE(review): the ``mon`` attribute value is matched literally, including
    ``pn=0``; anchors carrying any other value are not reported. Confirm
    against the live page whether that is intended.

    Args:
        html: Optional page markup to parse. When ``None`` (the default) the
            page is downloaded from ``http://news.baidu.com/``.
        timeout: Seconds to wait for the HTTP request. Only used when the
            page has to be downloaded.

    Yields:
        dict: ``{'title': str, 'url': str}`` for each matching anchor with a
        non-empty title. HTML entities in the title are left exactly as they
        appear in the page.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    if html is None:
        response = requests.get('http://news.baidu.com/', timeout=timeout)
        response.raise_for_status()
        # When the Content-Type header names no charset, requests decodes
        # text as ISO-8859-1, which garbles Chinese pages; fall back to the
        # encoding detected from the body instead.
        if 'charset' not in response.headers.get('Content-Type', '').lower():
            response.encoding = response.apparent_encoding
        html = response.text

    # [^"]* stops the href capture at the closing quote. A lazy ".*?" under
    # re.S would start at an earlier, unrelated <a href="..."> and swallow
    # everything up to the anchor that actually carries the mon attribute.
    item_re = (
        r'<a href="([^"]*)" target="_blank" mon="ct=1&a=2&c=top&pn=0&">'
        r'(.*?)</a>'
    )
    for link, raw_title in re.findall(item_re, html, re.S):
        # Drop tags, then collapse the whitespace/newlines left between them.
        title = ' '.join(re.sub(r'<[^>]+>', '', raw_title).split())
        if not title:
            continue
        yield {'title': title, 'url': link}
# 将新闻发布到wordpress
def post_news_wordpress(news, client=None):
    """Publish one scraped headline to WordPress as a link post.

    The post title is the headline text and the post body is a single
    ``<a>`` element pointing at the article. Both values come from scraped
    third-party pages, so they are cleaned and HTML-escaped before being
    placed in the post body.

    Args:
        news: Mapping with string entries ``'title'`` and ``'url'``.
        client: Optional, already constructed XML-RPC ``Client`` to post
            through, so callers can reuse one client for many posts. When
            omitted, a new client is created for this call from the
            placeholder endpoint and credentials below.

    Raises:
        KeyError: If ``news`` lacks ``'title'`` or ``'url'``.
    """
    # Standard library; imported locally so the file's import block stays
    # untouched.
    import html

    # Scraped titles may still contain markup: strip tags and collapse the
    # whitespace they leave behind so the link text is plain.
    title = ' '.join(re.sub(r'<[^>]+>', '', news['title']).split())
    url = news['url'].strip()

    if client is None:
        # Fill in the site's real xmlrpc.php address and credentials here.
        client = Client('http://example.com/xmlrpc.php', 'username', 'password')

    post = WordPressPost()
    post.title = title
    # Unescape first so entities already present in the scraped text (e.g.
    # "&amp;") are not double-escaped, then escape so quotes or angle
    # brackets cannot break out of the attribute or the element.
    post.content = '<a href="{}">{}</a>'.format(
        html.escape(html.unescape(url), quote=True),
        html.escape(html.unescape(title), quote=False),
    )
    post.post_status = 'publish'
    client.call(NewPost(post))
def main():
    """Scrape every news source and publish each headline to WordPress.

    Sources are processed one after another. A failure while fetching or
    posting for one source is logged with its traceback and the remaining
    sources are still processed.

    Returns:
        int: Number of sources that raised an exception.
    """
    # Standard library; imported locally so the file's import block stays
    # untouched.
    import logging

    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger(__name__)

    failed = 0
    for source in (get_news_163, get_news_sina, get_news_baidu):
        try:
            for news in source():
                post_news_wordpress(news)
        except Exception:
            # Top-level boundary: record the failure and move on so one
            # broken site does not block the others.
            log.exception('Failed while processing %s', source.__name__)
            failed += 1
    return failed


if __name__ == '__main__':
    # Keep a non-zero exit status when anything failed, as an uncaught
    # exception would have produced.
    raise SystemExit(1 if main() else 0)
# python爬取网易、新浪、百度等网站热门新闻并发布到wordpress
# 版权声明:本文采用知识共享 署名4.0国际许可协议 [BY-NC-SA] 进行授权
# 文章名称:《python爬取网易、新浪、百度等网站热门新闻并发布到wordpress》
# 文章链接:https://www.gebizhan.com/1718.html
# 本站资源仅供个人学习交流,请于下载后24小时内删除,不允许用于商业用途,否则法律问题自行承担。
# 文章名称:《python爬取网易、新浪、百度等网站热门新闻并发布到wordpress》
# 文章链接:https://www.gebizhan.com/1718.html
# 本站资源仅供个人学习交流,请于下载后24小时内删除,不允许用于商业用途,否则法律问题自行承担。