import requests
import threading
import pymysql
# Download a single image and save it to disk.
def get_img(url, filename='img.jpg', timeout=10):
    """Fetch ``url`` over HTTP and write the response body to ``filename``.

    Args:
        url: Address of the image to download.
        filename: Path the downloaded bytes are written to. Defaults to
            ``'img.jpg'``, so existing one-argument calls keep writing
            to the same file as before.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or
            the server answers with an HTTP error status.
    """
    r = requests.get(url, timeout=timeout)
    # Fail before touching the file so an HTTP error page is never saved
    # as if it were the image.
    r.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(r.content)
# Publish an article, plus a row for its image attachment, directly into
# the WordPress database.
def post_article(title, content):
    """Insert a published post and an ``img.jpg`` attachment into ``wp_posts``.

    The post row is inserted first; the attachment row is then inserted
    with ``post_parent`` set to the new post's ID. Both inserts are
    committed together, and rolled back together if either one fails.

    Args:
        title: Stored as both ``post_title`` and ``post_name`` of the post.
        content: Stored as ``post_content`` of the post.

    Raises:
        Exception: Any error raised while connecting or executing the
            statements is propagated to the caller (after a rollback
            when the connection was already open).
    """
    # NOTE(review): connection settings, including the password, are
    # hard-coded here; consider loading them from configuration.
    conn = pymysql.connect(host='localhost', user='root', password='123456', db='wordpress', charset='utf8')
    try:
        with conn.cursor() as cursor:
            # Insert the article itself.
            sql = "INSERT INTO wp_posts (post_author, post_date, post_date_gmt, post_content, post_title, post_status, comment_status, ping_status, post_name, post_modified, post_modified_gmt, post_type) VALUES (1, now(), now(), %s, %s, 'publish', 'open', 'open', %s, now(), now(), 'post')"
            cursor.execute(sql, (content, title, title))
            post_id = cursor.lastrowid
            # Insert the image attachment already linked to its parent
            # post, instead of inserting it and updating it afterwards.
            sql = "INSERT INTO wp_posts (post_author, post_date, post_date_gmt, post_content, post_title, post_status, comment_status, ping_status, post_name, post_modified, post_modified_gmt, post_type, post_parent) VALUES (1, now(), now(), '', 'img.jpg', 'inherit', 'open', 'open', 'img.jpg', now(), now(), 'attachment', %s)"
            cursor.execute(sql, (post_id,))
        conn.commit()
    except Exception:
        # Never leave a post without its attachment row (or vice versa).
        conn.rollback()
        raise
    finally:
        # Always release the connection, even when a statement failed.
        conn.close()
# Download the image and publish the article at the same time.
def spider_and_post(url, title, content):
    """Run ``get_img(url)`` and ``post_article(title, content)`` in parallel.

    Each task gets its own thread; the call returns only after both
    threads have finished.
    """
    workers = [
        threading.Thread(target=get_img, args=(url,)),
        threading.Thread(target=post_article, args=(title, content)),
    ]
    # Start both workers before waiting on either one.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
if __name__ == '__main__':
    # Sample inputs for a single download-and-publish run.
    url, title, content = (
        'http://www.example.com/img.jpg',
        '文章标题',
        '文章内容',
    )
    spider_and_post(url, title, content)
# Alternative approach: publishing without going through the database
import requests
import threading
import time
from bs4 import BeautifulSoup
# Module-level accumulator holding the image addresses found by the crawler.
img_urls: list = []
# Collect the image addresses referenced by one web page.
def get_img_urls(url):
    """Fetch ``url`` and append the address of each ``<img>`` to ``img_urls``.

    ``<img>`` tags with a missing or empty ``src`` attribute are skipped,
    and relative addresses are resolved against the URL of the fetched
    page, so every stored entry is usable outside that page.

    Args:
        url: Address of the page to scan.

    Raises:
        requests.RequestException: If the request fails, times out, or
            the server answers with an HTTP error status.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    from urllib.parse import urljoin

    # Request the page; the timeout keeps a stalled server from hanging
    # the calling thread forever.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Parse the page and walk over every image tag.
    soup = BeautifulSoup(response.text, 'lxml')
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            # Not every <img> carries a src attribute.
            continue
        # response.url is the address actually served (after redirects),
        # which is the correct base for relative links.
        img_urls.append(urljoin(response.url, src))
# Build the article that embeds the crawled images.
def post_article(img_urls):
    """Assemble the title and HTML body of an article showing ``img_urls``.

    The body is a fixed ``<h1>`` heading followed by one ``<img>`` tag per
    address. Each address is HTML-escaped before being placed in the
    ``src`` attribute, because it comes from scraped pages. The
    publishing step itself is not implemented here, so the function
    only builds the strings and returns ``None``.

    Args:
        img_urls: Iterable of image addresses to embed.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    from html import escape

    # Article title.
    title = '爬取图片并发布到wordpress文章'
    # Article body: heading first, then one image tag per address.
    heading = '<h1>爬取图片并发布到wordpress文章</h1>'
    content = heading + ''.join(
        '<img src="{}">'.format(escape(str(img_url), quote=True))
        for img_url in img_urls
    )
    # Publish the article.
    # TODO: the publishing code is omitted here; `title` and `content`
    # are the values it is expected to send.
# Entry point: crawl all pages in parallel, then publish one article.
def main():
    """Scan each page on its own thread and publish the collected images.

    One thread per page address runs ``get_img_urls``; once every thread
    has finished, ``post_article`` is called with the shared ``img_urls``
    list.
    """
    # Pages to crawl.
    urls = ['http://www.example.com/1.html', 'http://www.example.com/2.html', 'http://www.example.com/3.html']
    # One worker thread per page.
    threads = [
        threading.Thread(target=get_img_urls, args=(page_url,))
        for page_url in urls
    ]
    for worker in threads:
        worker.start()
    # Wait for every worker before publishing.
    for worker in threads:
        worker.join()
    # Publish the article.
    post_article(img_urls)
# Run the crawl-and-publish flow only when executed as a script.
if __name__ == '__main__':
    main()