web_scrap/lib/WPMenu.py

359 lines
20 KiB
Python
Raw Normal View History

2023-06-30 23:28:54 +02:00
#!/usr/bin/python3
import html
import json
import logging
import os
import re
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
class WPMenu:
    """Rebuild the navigation menu of a Canalblog page on a WordPress site.

    The menu is read from the ``<ul id="listsmooth">`` element of a Canalblog
    page (local file or URL). Each entry is then matched against WordPress
    categories, tags or posts through the ``wp-json/wp/v2`` REST endpoints,
    and a menu named ``"Menu <wordpress host>"`` is looked up or created.
    """

    # Constructor
    def __init__(self, name="Thread-0", basic=None, canalblog="", wordpress="", logger=None, parser="html.parser", ssl_canalblog=True, ssl_wordpress=True):
        """Store the configuration and prepare one retrying HTTP session per site.

        Args:
            name: Label prefixed to every log message.
            basic: Value passed as ``auth=`` to every WordPress request.
            canalblog: Canalblog host, used to recognise the "home" link.
            wordpress: WordPress host used to build the REST URLs.
            logger: Logger to use; a ``logging`` logger named after ``name``
                is used when omitted.
            parser: Parser name handed to BeautifulSoup.
            ssl_canalblog: ``False`` selects ``http`` for Canalblog.
            ssl_wordpress: ``False`` selects ``http`` for WordPress.
        """
        self._name = name
        self._basic = basic
        self._canalblog = canalblog
        self._wordpress = wordpress
        # A None logger would crash on the very first log call.
        self._logger = logger if logger is not None else logging.getLogger(name)
        self._parser = parser
        self._headers_json = {'Content-Type': 'application/json; charset=utf-8', 'Accept':'application/json'}
        self._protocol_wordpress = "http" if ssl_wordpress is False else "https"
        self._protocol_canalblog = "http" if ssl_canalblog is False else "https"
        self._directory = "backup"

        self._request_canalblog = requests.Session()
        self._request_wordpress = requests.Session()

        retries = Retry(connect=10, read=10, redirect=5,
                        status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)

        self._request_canalblog.mount('{0}://'.format(self._protocol_canalblog), HTTPAdapter(max_retries=retries))
        self._request_wordpress.mount('{0}://'.format(self._protocol_wordpress), HTTPAdapter(max_retries=retries))

    # Destructor
    def __del__(self):
        """Print a completion message when the instance is collected."""
        print("{0} : Import finished for {1}".format(self._name, self._wordpress))

    # Private helper
    def _apiUrl(self, endpoint):
        """Return the WordPress REST URL for ``endpoint`` (e.g. ``"posts"``)."""
        return "{0}://{1}/wp-json/wp/v2/{2}".format(self._protocol_wordpress, self._wordpress, endpoint)

    # Public method
    ## From file
    def fromFile(self, files):
        """Read the HTML file at path ``files`` and process its menu.

        A missing file is logged as an error and otherwise ignored.
        """
        if not os.path.exists(files):
            self._logger.error("{0} : File isn't exist : {1}".format(self._name, files))
            return
        with open(files, 'r') as f:
            self._logger.info("{0} : File is being processed : {1}".format(self._name, files))
            content = f.read()
        self._menu(content)

    ## Get from URL
    def fromUrl(self, canalblog):
        """Download the Canalblog page ``canalblog`` and process its menu.

        The scheme of the URL is forced to the configured Canalblog protocol.
        A connection error terminates the process with status 1; any other
        error is logged.
        """
        self._canalblog = canalblog
        try:
            o = urlparse(canalblog)
            o = o._replace(scheme=self._protocol_canalblog)
            # A scheme-less input yields "scheme:///host"; collapse the extra slash.
            i = o.geturl().replace(":///", "://")
            page = self._request_canalblog.get(i)
            if page.status_code == 200:
                self._logger.info("{0} : Page web is being processed : {1}".format(self._name, i))
                self._menu(page.content)
            else:
                self._logger.error("{0} : index didn't get due status code : {1}".format(self._name, page.status_code))
                self._logger.debug("{0} : {1}".format(self._name, page.content))
        # requests' ConnectionError does not derive from the builtin one,
        # so both have to be listed for the fatal branch to be reachable.
        except (ConnectionError, requests.exceptions.ConnectionError) as err:
            self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, canalblog, err))
            raise SystemExit(1)
        except Exception as err:
            self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, canalblog, err))

    ## Replace characters
    def _replaceCaracter(self, title_rendered):
        """Normalise a rendered title so it can be compared with plain text.

        HTML entities are decoded first, then typographic quotes, dashes,
        ellipses and French guillemets are replaced by ASCII equivalents.
        """
        title_rendered = html.unescape(title_rendered)
        # Ordered pairs: the last rule relies on a guillemet having already
        # been turned into a double quote.
        replacements = (
            ("\u2019", "'"),
            ("\u2013", "-"),
            ("\u2026", "..."),
            ("\u00ab ", '"'),
            (" \u00bb", '"'),
            ("\u00ab\u00a0", '"'),
            ("\u00a0\u00bb", '"'),
            ("\"\u2018", "'"),
        )
        for old, new in replacements:
            title_rendered = title_rendered.replace(old, new)
        return title_rendered

    def _getIdfromTitlePost(self, content):
        """Find the WordPress post whose title matches the article in ``content``.

        ``content`` is the HTML of a Canalblog article; its title is read
        from ``<h2 class="articletitle">``. Up to nine result pages of the
        posts search are examined.

        Returns:
            ``{"id", "type", "link"}``; ``id`` is 0 when nothing matched.
        """
        idMenu = {"id":0, "type":"", "link":""}
        soup = BeautifulSoup(content, self._parser)
        heading = soup.find("h2", class_="articletitle")
        if heading is None:
            self._logger.error("{0} : No article title found in content".format(self._name))
            return idMenu
        articletitle = heading.get_text()
        url = self._apiUrl("posts")
        for index in range(1, 10):
            params = {"search":articletitle, "per_page":100, "page":index}
            try:
                self._logger.debug("{0} : Get Url for post : {1} {2}".format(self._name, url, params))
                page = self._request_wordpress.get(url, auth=self._basic, params=params)
                if page.status_code == 200:
                    result = page.json()
                    self._logger.info("{0} : Get content post : {1}".format(self._name, len(result)))
                    if len(result) == 0:
                        self._logger.debug("{0} : {2} {1}".format(self._name, result, len(result)))
                        break
                    found = False
                    for post in result:
                        title_rendered = post["title"]["rendered"]
                        # Only normalise when the lengths differ, i.e. when the
                        # titles cannot already be equal.
                        if len(articletitle) != len(title_rendered):
                            title_rendered = self._replaceCaracter(title_rendered)
                        self._logger.debug("{0} : comparaison debug {1} {2}".format(self._name, articletitle, title_rendered))
                        if articletitle == title_rendered:
                            self._logger.debug("{0} : get post id : {1}".format(self._name, post))
                            idMenu = {"id":post["id"], "type":"post", "link": post["link"]}
                            found = True
                    if found:
                        break
                elif page.status_code == 400:
                    # Requesting a page past the last one answers 400.
                    self._logger.debug("{0} : {2} {1}".format(self._name, page.content, page.status_code))
                    break
                else:
                    self._logger.error("{0} : Post didn't get due status code : {1}".format(self._name, page.status_code))
                    self._logger.debug("{0} : {1}".format(self._name, page.content))
            except (ConnectionError, requests.exceptions.ConnectionError) as err:
                self._logger.error("{0} : Connection error for get content : {1}".format(self._name, err))
                raise SystemExit(1)
            except Exception as err:
                self._logger.error("{0} : Exception error for get content : {1} ".format(self._name, err))
        return idMenu

    def _getIdFromPost(self, href):
        """Resolve a menu link that points at a single article.

        An ``href`` with a host is downloaded from Canalblog; one without is
        read from ``<backup directory>/..<path>`` when that file exists. The
        article HTML is then handed to ``_getIdfromTitlePost``.

        Returns:
            ``{"id", "type", "link"}``; ``id`` is 0 when nothing matched.
        """
        idMenu = {"id":0, "type":"", "link":""}
        o = urlparse(href)
        if len(o.netloc) > 0:
            try:
                page = self._request_canalblog.get(href)
                if page.status_code == 200:
                    self._logger.info("{0} : Get content : {1}".format(self._name, href))
                    idMenu = self._getIdfromTitlePost(page.content)
                else:
                    self._logger.error("{0} : {2} didn't get due status code : {1}".format(self._name, page.status_code, href))
                    self._logger.debug("{0} : {1}".format(self._name, page.content))
            except (ConnectionError, requests.exceptions.ConnectionError) as err:
                self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, href, err))
                raise SystemExit(1)
            except Exception as err:
                self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, href, err))
        else:
            path = "{0}/..{1}".format(self._directory, o.path)
            if os.path.exists(path):
                try:
                    # Read inside a context manager so the handle is released.
                    with open(path, "r") as f:
                        content = f.read()
                    idMenu = self._getIdfromTitlePost(content)
                except Exception as err:
                    self._logger.error("{0} : Exception error for get file content {1} : {2}".format(self._name, href, err))
        return idMenu

    def _getIdFromReverse(self, title, href):
        """Resolve a menu entry from its link when the title lookup failed.

        The last path segment of ``href`` decides the strategy: for
        ``index.html`` the preceding segment is looked up as a category or
        tag, otherwise the link is treated as an article.

        Returns:
            ``{"id", "type", "link"}``; ``id`` is 0 for ``"#"``, an empty
            ``href`` or when nothing matched.
        """
        self._logger.info("{0} : get title {1} from href {2}".format(self._name, title, href))
        idMenu = {"id":0, "type":"", "link":""}
        if not href or href == "#":
            return idMenu
        segments = href.rsplit("/", 2)
        link = segments[-1]
        # A link without any "/" has no preceding segment.
        title = segments[-2] if len(segments) >= 2 else ""
        self._logger.info("{0} link {1} title {2}".format(self._name, link, title))
        if link == "index.html":
            idMenu = self._getId(title)
        else:
            idMenu = self._getIdFromPost(href)
        return idMenu

    def _getId(self, title):
        """Look ``title`` up among WordPress categories, then tags.

        An entry matches when its ``name`` or ``slug`` equals ``title``. Up
        to nine result pages are read per endpoint and the search stops after
        the first page that yields a match.

        Returns:
            ``{"id", "type", "link"}`` with ``type`` ``"category"`` or
            ``"tag"``; ``id`` is 0 when nothing matched.
        """
        idMenu = {"id": 0, "type":"", "link":""}
        for endpoint, typeId in (("categories", "category"), ("tags", "tag")):
            url = self._apiUrl(endpoint)
            for index in range(1, 10):
                try:
                    params = {"search":title, "per_page":"100", "page":index}
                    self._logger.info("{0} Get menu {1} {2} {3}".format(self._name, url, index, title))
                    page = self._request_wordpress.get(url, auth=self._basic, params=params)
                    if page.status_code == 200:
                        result = page.json()
                        if len(result) == 0:
                            break
                        found = False
                        for j in result:
                            self._logger.info("{0} info : {1} {2} {3}".format(self._name, j["name"], j["slug"], title))
                            if j["name"] == title or j["slug"] == title:
                                self._logger.info("{0} : comparaison ok : {1} {2}".format(self._name, j["id"], endpoint))
                                idMenu = {"id": j["id"], "type": typeId, "link": j["link"]}
                                found = True
                        if found:
                            # Stop here so a later page or a tag cannot
                            # overwrite the match.
                            return idMenu
                    elif page.status_code == 400:
                        break
                    else:
                        self._logger.error("{0} : {2} didn't get due status code : {1}".format(self._name, page.status_code, endpoint))
                        self._logger.debug("{0} : {1}".format(self._name, page.content))
                except (ConnectionError, requests.exceptions.ConnectionError) as err:
                    self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, url, err))
                    raise SystemExit(1)
                except Exception as err:
                    self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, url, err))
        return idMenu

    def _menu(self, content):
        """Parse the Canalblog menu in ``content`` and map it onto WordPress.

        Steps: collect every ``<li>`` of ``<ul id="listsmooth">``, attach the
        entries of ``"#"`` sub-menus to their parent, resolve each entry to a
        WordPress id, then call ``_createMenu``.

        Raises:
            SystemExit: always, with status 0, once ``_createMenu`` returned.
        """
        soup = BeautifulSoup(content, self._parser)
        ul = soup.find("ul", id="listsmooth")
        if ul is None:
            self._logger.error("{0} : No menu (ul#listsmooth) found in content".format(self._name))
            return

        homeUrl = "{0}://{1}/".format(self._protocol_canalblog, self._canalblog)
        menu = []
        children = []
        for anchor in ul.find_all("li"):
            a = anchor.find("a")
            if a is None:
                continue
            parent = a.get_text().replace(" \xa0", "")
            href = a.get("href")
            if href == homeUrl:
                parent = "home"
            itemMenu = {"id":"", "type":"", "title": parent, "link":"", "href":href, "children":list()}
            if href == "#":
                submenu = anchor.find("ul")
                if submenu is not None:
                    for child in submenu.find_all("li"):
                        childAnchor = child.find("a")
                        if childAnchor is None:
                            continue
                        self._logger.info("{0} Parent {1} : Child {2}".format(self._name, parent, childAnchor.get_text()))
                        children.append({"title": childAnchor.get_text(), "parent": parent, "href": childAnchor.get("href"), "link":""})
            menu.append(itemMenu)

        # find_all("li") also returned the nested entries; drop them from the
        # top level. Filtering avoids skipping items, which deleting by index
        # inside a loop would do.
        childTitles = {child["title"] for child in children}
        topLevel = []
        for j, item in enumerate(menu):
            if item["title"] in childTitles:
                self._logger.info("{0} : Parent {1} {2}".format(self._name, item, j))
            else:
                topLevel.append(item)
        menu = topLevel

        for i, child in enumerate(children):
            self._logger.info("{0} : Child {1} {2}".format(self._name, child, i))
            for item in menu:
                if item["title"] == child["parent"]:
                    item["children"].append({"id":"", "type":"", "title":child["title"], "parent": child["parent"], "link":"", "href":child["href"]})

        for i in range(len(menu)):
            self._logger.info("{0} : Menu {1} {2}".format(self._name, menu[i]["title"], len(menu[i]["children"])))
            if menu[i]["title"] == "home":
                continue
            for j, child in enumerate(menu[i]["children"]):
                idMenu = self._getId(child["title"])
                if idMenu["id"] == 0:
                    self._logger.debug("{0} : content children {1}".format(self._name, child))
                    idMenu = self._getIdFromReverse(child["title"], child["href"])
                if idMenu["id"] != 0:
                    menu[i]["children"][j] = {"id":idMenu["id"], "type": idMenu["type"], "link": idMenu["link"], "title": child["title"], "parent": child["parent"]}
            idMenu = self._getId(menu[i]["title"])
            self._logger.debug("{0} : content parent {1}".format(self._name, menu[i]))
            self._logger.debug("{0} : content idMenu {1}".format(self._name, idMenu))
            if idMenu["id"] == 0:
                idMenu = self._getIdFromReverse(menu[i]["title"], menu[i]["href"])
            if idMenu["id"] != 0:
                menu[i] = {"id":idMenu["id"], "type": idMenu["type"], "title":menu[i]["title"], "link":idMenu["link"], "children": menu[i]["children"]}

        self._createMenu(menu)

        # NOTE(review): the process is terminated here on purpose in the
        # original code; kept as is, confirm whether callers rely on it.
        raise SystemExit(0)

    def _createItemMenu(self, idMenu, itemMenu, parent):
        """Create or update one WordPress menu item.

        An existing item with the same rendered title in menu ``idMenu`` is
        updated; otherwise a new item is created.

        Args:
            idMenu: Id of the WordPress menu receiving the item.
            itemMenu: Mapping with ``title``, ``type`` and ``id`` keys.
            parent: Id of the parent menu item.

        Returns:
            Id of the pre-existing item, or 0 when none existed.
            NOTE(review): the id of a freshly created item is not returned.
        """
        idItemMenu = 0
        baseUrl = self._apiUrl("menu-items")
        self._logger.info("{0} : Create item menu from API Wordpress : {1}".format(self._name, self._wordpress))
        try:
            params = {"search": itemMenu["title"], "menus": idMenu}
            page = self._request_wordpress.get(baseUrl, auth=self._basic, params=params)
            if page.status_code != 200:
                self._logger.error("{0} : Get menu items didn't get due status code : {1}".format(self._name, page.status_code))
                self._logger.debug("{0} : {1}".format(self._name, page.content))
                return idItemMenu
            result = page.json()
            for i in result:
                if i["title"]["rendered"] == itemMenu["title"]:
                    idItemMenu = i["id"]
            self._logger.info("{0} : Length of result for item menus : {1}".format(self._name, len(result)))
        except (ConnectionError, requests.exceptions.ConnectionError) as err:
            self._logger.error("{0} : Connection error for get item menus {1} : {2}".format(self._name, baseUrl, err))
            raise SystemExit(1)
        except Exception as err:
            self._logger.error("{0} : Exception error for get item menus {1} : {2}".format(self._name, baseUrl, err))
            return idItemMenu

        # Posting to the item's own URL updates it instead of creating one.
        url = baseUrl
        if idItemMenu != 0:
            url = "{0}/{1}".format(baseUrl, idItemMenu)
        try:
            objectt = itemMenu["type"]
            if objectt == "tag":
                objectt = "post_tag"
            data = {"title": itemMenu["title"], "status": "publish", "parent":parent, "menus":idMenu, "object":objectt, "object_id":itemMenu["id"]}
            page = self._request_wordpress.post(url, auth=self._basic, headers=self._headers_json, data=json.dumps(data))
            if page.status_code == 201:
                self._logger.info("{0} : create item menu : {1}".format(self._name, itemMenu["title"]))
            elif page.status_code == 200:
                self._logger.info("{0} : update item menu : {1}".format(self._name, itemMenu["title"]))
            else:
                self._logger.error("{0} : Create menu items didn't get due status code : {1}".format(self._name, page.status_code))
                self._logger.debug("{0} : {1}".format(self._name, page.content))
        except (ConnectionError, requests.exceptions.ConnectionError) as err:
            self._logger.error("{0} : Connection error for create item menu {1} : {2}".format(self._name, baseUrl, err))
            raise SystemExit(1)
        except Exception as err:
            self._logger.error("{0} : Exception error for get item menu {1} : {2}".format(self._name, baseUrl, err))
        return idItemMenu

    def _createMenu(self, menu):
        """Look up, or create, the WordPress menu named ``"Menu <wordpress host>"``.

        Args:
            menu: Parsed menu structure. It is not used here: this method
                does not create the menu items.

        Returns:
            Id of the menu, 0 when it could not be resolved, or ``None``
            when the lookup request failed.
        """
        title = "Menu {0}".format(self._wordpress)
        url = self._apiUrl("menus")
        self._logger.info("{0} : Create menu from API Wordpress : {1}".format(self._name, title))
        try:
            params = {"search": title}
            page = self._request_wordpress.get(url, auth=self._basic, params=params)
            if page.status_code != 200:
                self._logger.error("{0} : Get menu didn't get due status code : {1}".format(self._name, page.status_code))
                self._logger.debug("{0} : {1}".format(self._name, page.content))
                return None
            result = page.json()
            self._logger.info("{0} : Get content menus : {1}".format(self._name, len(result)))
            idMenu = 0
            if len(result) == 0:
                self._logger.info("{0} : Create menu : {1}".format(self._name, title))
                data = {"name": title}
                try:
                    page = self._request_wordpress.post(url, auth=self._basic, headers=self._headers_json, data=json.dumps(data))
                    if page.status_code == 201:
                        result = page.json()
                        self._logger.debug("{0} : Get menus : {1}".format(self._name, result))
                        if len(result) > 0:
                            idMenu = result["id"]
                    else:
                        self._logger.error("{0} : Post menu didn't get due status code : {1}".format(self._name, page.status_code))
                        self._logger.debug("{0} : {1}".format(self._name, page.content))
                except (ConnectionError, requests.exceptions.ConnectionError) as err:
                    self._logger.error("{0} : Connection error for create menu {1} : {2}".format(self._name, url, err))
                    raise SystemExit(1)
                except Exception as err:
                    self._logger.error("{0} : Exception error for get menu {1} : {2}".format(self._name, url, err))
            else:
                self._logger.debug("{0} : Get menus : {1}".format(self._name, result))
                for i in result:
                    self._logger.debug("{0} : List menus : {1}".format(self._name, i))
                    if i["name"] == title:
                        idMenu = i["id"]
            self._logger.info("{0} : Get ID menus : {1}".format(self._name, idMenu))
            return idMenu
        except (ConnectionError, requests.exceptions.ConnectionError) as err:
            self._logger.error("{0} : Connection error for get menu {1} : {2}".format(self._name, url, err))
            raise SystemExit(1)
        except Exception as err:
            self._logger.error("{0} : Exception error for get menu {1} : {2}".format(self._name, url, err))
        return None