web_scrap/lib/WPMenu.py

130 lines
5.7 KiB
Python
Raw Normal View History

2023-06-30 23:28:54 +02:00
#!/usr/bin/python3
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests, os, logging, re, json
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
class WPMenu:
    """Read a navigation menu from a source page and match entries in WordPress.

    The menu is read from the ``<ul id="listsmooth">`` element of an HTML
    document (loaded from a local file or fetched over HTTP) and its entries
    can be looked up among the categories and tags exposed by the WordPress
    REST API (``/wp-json/wp/v2/``).
    """

    # Constructor
    def __init__(self, name="Thread-0", basic=None, canalblog="", wordpress="", logger=None, parser="html.parser", ssl_canalblog=True, ssl_wordpress=True):
        """Store the configuration and prepare one retrying HTTP session per site.

        Args:
            name: Label prefixed to every log message.
            basic: Value passed as ``auth=`` on WordPress REST requests.
            canalblog: Source blog address (stored, not used by this class yet).
            wordpress: WordPress host used to build REST URLs.
            logger: Logger receiving the messages; when ``None`` a module
                logger is used so that logging calls never fail.
            parser: Parser name handed to BeautifulSoup.
            ssl_canalblog: ``False`` selects ``http`` for the source site.
            ssl_wordpress: ``False`` selects ``http`` for the WordPress site.
        """
        self._name = name
        self._basic = basic
        self._canalblog = canalblog
        self._wordpress = wordpress
        # A missing logger used to crash on the first log call.
        self._logger = logger if logger is not None else logging.getLogger(__name__)
        self._parser = parser
        self._headers_json = {'Content-Type': 'application/json; charset=utf-8', 'Accept':'application/json'}
        self._directory = "backup"

        # Only an explicit False disables TLS; any other value keeps https.
        self._protocol_wordpress = "http" if ssl_wordpress is False else "https"
        self._protocol_canalblog = "http" if ssl_canalblog is False else "https"

        self._request_canalblog = requests.Session()
        self._request_wordpress = requests.Session()

        # Retry transient failures and throttling/server errors with backoff.
        retries = Retry(connect=10, read=10, redirect=5,
                        status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)

        self._request_canalblog.mount('{0}://'.format(self._protocol_canalblog), HTTPAdapter(max_retries=retries))
        self._request_wordpress.mount('{0}://'.format(self._protocol_wordpress), HTTPAdapter(max_retries=retries))

    # Destructor
    def __del__(self):
        """Print a completion message when the instance is collected."""
        print("{0} : Import finished for {1}".format(self._name, self._wordpress))

    # Public method

    ## From file
    def fromFile(self, files):
        """Build the menu from the HTML file at path ``files``.

        Logs an error and does nothing else when the path does not exist.
        """
        if not os.path.exists(files):
            self._logger.error("{0} : File isn't exist : {1}".format(self._name, files))
            return

        with open(files, 'r') as f:
            self._logger.info("{0} : File is being processed : {1}".format(self._name, files))
            content = f.read()
        self._menu(content)

    ## Get from URL
    def fromUrl(self, canalblog):
        """Fetch ``canalblog`` over HTTP and build the menu from the response.

        The URL scheme is forced to the protocol chosen at construction time.
        A non-200 response is logged. A connection error is logged and ends
        the process with exit status 1; any other exception is only logged.
        """
        try:
            parsed = urlparse(canalblog)
            parsed = parsed._replace(scheme=self._protocol_canalblog)
            # A scheme-less input yields "scheme:///host/..."; collapse it.
            url = parsed.geturl().replace(":///", "://")
            page = self._request_canalblog.get(url)
            if page.status_code == 200:
                self._logger.info("{0} : Page web is being processed : {1}".format(self._name, url))
                self._menu(page.content)
            else:
                self._logger.error("{0} : index didn't get due status code : {1}".format(self._name, page.status_code))
                self._logger.debug("{0} : {1}".format(self._name, page.content))
        # requests raises its own ConnectionError, which is not a subclass of
        # the builtin one, so both are listed to make this handler reachable.
        except (ConnectionError, requests.exceptions.ConnectionError) as err:
            self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, canalblog, err))
            raise SystemExit(1)
        except Exception as err:
            self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, canalblog, err))

    def _getId(self, title, parent):
        """Look ``title`` up among WordPress categories, then tags.

        Each endpoint is queried with ``search=title``; the first element of
        the first non-empty 200 response wins.

        Returns:
            A dict with keys ``id``, ``type`` (``"categories"`` or ``"tags"``),
            ``title``, ``parent`` and ``children``. When nothing matches, the
            same keys are returned with empty values.
        """
        for kind in ("categories", "tags"):
            url = "{2}://{0}/wp-json/wp/v2/{1}".format(self._wordpress, kind, self._protocol_wordpress)
            try:
                params = {"search":title, "per_page":"100"}
                page = self._request_wordpress.get(url, auth=self._basic, params=params)
                if page.status_code == 200:
                    result = page.json()
                    if len(result) > 0:
                        # First hit wins; later endpoints are not queried.
                        return {"id":result[0]["id"], "type":kind, "title": title, "parent":parent, "children": []}
                else:
                    self._logger.error("{0} : {2} didn't get due status code : {1}".format(self._name, page.status_code, kind))
                    self._logger.debug("{0} : {1}".format(self._name, page.content))
            # See fromUrl: the requests exception must be named explicitly.
            except (ConnectionError, requests.exceptions.ConnectionError) as err:
                self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, url, err))
                raise SystemExit(1)
            except Exception as err:
                self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, url, err))
        return {"id":"", "type":"", "title": "", "parent":"", "children": []}

    def _menu(self, content):
        """Parse ``content`` and build the menu found in ``<ul id="listsmooth">``.

        Every ``<li>`` with a link becomes an entry whose title is the link
        text. When the link target is ``"#"``, the ``<li>`` elements of its
        nested ``<ul>`` are recorded as children. Each entry is logged.

        Returns:
            The list of entries (empty when the menu element is missing).
        """
        soup = BeautifulSoup(content, self._parser)
        ul = soup.find("ul", id="listsmooth")
        menu = []
        if ul is None:
            self._logger.error("{0} : Menu list 'listsmooth' not found".format(self._name))
            return menu

        # NOTE(review): find_all is recursive, so nested child <li> elements
        # are also visited here and become entries of their own -- confirm
        # whether they should be filtered out.
        for anchor in ul.find_all("li"):
            link = anchor.find("a")
            if link is None:
                continue
            parent = link.get_text().replace(" \xa0", "")
            itemMenu = {"id":"", "type":"", "title": parent, "parent": parent, "children":[]}
            if link.get("href") == "#":
                submenu = anchor.find("ul")
                children = submenu.find_all("li") if submenu is not None else []
                for child in children:
                    a = child.find("a")
                    if a is None:
                        continue
                    child_title = a.get_text()
                    self._logger.info("{0} Parent {1} : Child {2}".format(self._name, parent, child_title))
                    # A fresh dict per child: reusing one dict made every
                    # child in the list alias the last one processed.
                    itemMenu["children"].append({"id":"", "type":"", "title": child_title, "parent": parent})
            menu.append(itemMenu)

        # Log every entry (the previous range stopped one short of the end).
        for item in menu:
            self._logger.info("{0} Menu : {1}".format(self._name, item))
        return menu