web_scrap/insert_wordpress.py

71 lines
2.6 KiB
Python
Raw Normal View History

2023-03-23 23:28:57 +01:00
#!/usr/bin/python3
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from requests.auth import HTTPBasicAuth
from getpass import getpass
import requests, os, argparse, logging
# French month names, as written in the scraped date header, mapped to the
# two-digit month number used in the post's ISO 8601 date.
MONTHS = {
    "janvier": "01",
    "février": "02",
    "mars": "03",
    "avril": "04",
    "mai": "05",
    "juin": "06",
    "juillet": "07",
    "août": "08",
    "septembre": "09",
    "octobre": "10",
    "novembre": "11",
    "décembre": "12",
}

# WordPress REST API endpoints used by this script.
TAGS_URL = "http://localhost:8080/wp-json/wp/v2/tags"
POSTS_URL = "http://localhost:8080/wp-json/wp/v2/posts"

# Seconds to wait for the server; without a timeout a stalled connection
# would block the script forever.
REQUEST_TIMEOUT = 30


def format_post_date(date_text, hour_text):
    """Build the ``date`` value of the post from the scraped date and hour.

    *date_text* must contain, in this order and separated by whitespace, a
    day number, a French month name (a key of ``MONTHS``) and a year, e.g.
    ``"3 mars 2023"``.  *hour_text* is inserted as-is (minus surrounding
    whitespace) in front of ``":00"`` -- presumably it is ``HH:MM``; verify
    against the scraped pages.

    Returns a string shaped like ``"2023-03-03T23:28:00"``.
    Raises ``ValueError`` when the date cannot be understood.
    """
    # split() without argument also copes with leading/trailing whitespace
    # and line breaks around the date.
    parts = date_text.split()
    if len(parts) < 3:
        raise ValueError("unexpected date header: {0!r}".format(date_text))
    day, month_name, year = parts[:3]

    # Keep only the digits so that an ordinal such as "1er" is accepted, then
    # zero-pad: a one-digit day would not give a valid ISO 8601 date.
    day_digits = "".join(ch for ch in day if ch.isdigit())
    if not day_digits:
        raise ValueError("unexpected day in date header: {0!r}".format(day))

    try:
        month_number = MONTHS[month_name.lower()]
    except KeyError:
        raise ValueError("unknown month name: {0!r}".format(month_name)) from None

    return "{0}-{1}-{2}T{3}:00".format(
        year, month_number, day_digits.zfill(2), hour_text.strip()
    )


def resolve_tag_ids(names, known_tags, create):
    """Translate tag names into WordPress tag ids.

    *names* is the list of tag names found in the article, *known_tags* the
    list of tag objects (dicts with at least ``"name"`` and ``"id"``) already
    present on the server, and *create* a callable taking a tag name and
    returning the id of the newly created tag, or ``None`` on failure.

    Returns the list of ids, in the order of *names*; names whose creation
    failed are left out.
    """
    ids_by_name = {}
    for entry in known_tags:
        ids_by_name.setdefault(entry["name"], entry["id"])

    tag_ids = []
    for name in names:
        if name not in ids_by_name:
            new_id = create(name)
            if new_id is None:
                continue
            # Remember the new tag so a name repeated in the article is not
            # created a second time.
            ids_by_name[name] = new_id
        tag_ids.append(ids_by_name[name])
    return tag_ids


def fetch_all_tags():
    """Return every tag known to the server, following the API pagination.

    The tags endpoint only returns one page per request, so a single request
    can miss existing tags.  On a non-200 answer the tags collected so far
    (possibly none) are returned.
    """
    tags = []
    page_number = 1
    while True:
        response = requests.get(
            TAGS_URL,
            params={"per_page": 100, "page": page_number},
            timeout=REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            break
        batch = response.json()
        tags.extend(batch)
        total_pages = int(response.headers.get("X-WP-TotalPages", 1))
        if not batch or page_number >= total_pages:
            break
        page_number += 1
    return tags


def create_tag(name, auth):
    """Create the tag *name* on the server and return its id, or ``None``."""
    response = requests.post(
        TAGS_URL, auth=auth, data={"name": name}, timeout=REQUEST_TIMEOUT
    )
    if response.status_code == 201:
        return response.json()["id"]
    # Report the failure instead of silently publishing without the tag.
    print("Could not create tag {0!r}: HTTP {1}".format(name, response.status_code))
    return None


def extract_tag_names(footer):
    """Return the text of each link in *footer* whose first ``rel`` value is ``tag``."""
    names = []
    for link in footer.find_all("a", {"rel": True}):
        rel = link.get("rel")
        if rel and rel[0] == 'tag':
            names.append(link.text)
    return names


def require_first(soup, name, css_class):
    """Return the first ``<name class="css_class">`` element or exit with status 1."""
    node = soup.find(name, class_=css_class)
    if node is None:
        print("Missing <{0} class=\"{1}\"> in HTML file".format(name, css_class))
        raise SystemExit(1)
    return node


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--user", help="wordpress user", required=True)
    parser.add_argument("--file", help="HTML file", required=True)
    # Opt-in so that the default run keeps its historical behaviour: print the
    # payload and stop before creating the post.
    parser.add_argument(
        "--publish",
        action="store_true",
        help="create the post on the server; without this flag the payload "
             "is only printed (missing tags are still created)",
    )
    args = parser.parse_args()

    password = getpass()
    if not password:
        print("No password error !!! ")
        raise SystemExit(1)
    basic = HTTPBasicAuth(args.user, password)

    tags = fetch_all_tags()
    print(tags)

    # Read bytes and let BeautifulSoup detect the document encoding rather
    # than depending on the locale's default text encoding.
    with open(args.file, 'rb') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    title_node = require_first(soup, "h2", "articletitle")
    body_node = require_first(soup, "div", "articlebody")
    hour_node = require_first(soup, "span", "articledate")
    date_node = require_first(soup, "div", "dateheader")
    footer_node = require_first(soup, "div", "itemfooter")

    tag_names = extract_tag_names(footer_node)
    listtag = resolve_tag_ids(tag_names, tags, lambda name: create_tag(name, basic))

    data = {
        "title": title_node.text,
        # Serialise the element: a BeautifulSoup tag is iterable, so passing
        # it directly would make requests send its children one by one.
        "content": str(body_node),
        "status": "publish",
        "date": format_post_date(date_node.text, hour_node.text),
        "tags": listtag,
    }
    print(data)

    if not args.publish:
        raise SystemExit(0)

    # JSON body: a form-encoded list would be sent as repeated "tags" keys,
    # of which the server would only keep the last one.
    page = requests.post(POSTS_URL, auth=basic, json=data, timeout=REQUEST_TIMEOUT)
    print(page.status_code)
    if page.status_code == 201:
        print(page.content)