web_scrap/lib/WPMenu.py

61 lines
2.5 KiB
Python
Raw Normal View History

2023-06-30 23:28:54 +02:00
#!/usr/bin/python3
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests, os, logging, re, json
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
class WPMenu:
    """Fetch a Canalblog page and walk its ``listsmooth`` navigation menu.

    The instance owns a ``requests.Session`` configured with automatic
    retries and uses it to download the page at the ``canalblog`` URL.
    """

    # Constructor
    def __init__(self, name="Thread-0", basic=None, canalblog="", wordpress="", logger=None, parser="html.parser", ssl_canalblog=True, ssl_wordpress=True):
        """Store the configuration and prepare the HTTP session.

        Args:
            name: Label prefixed to every log/print message.
            basic: Stored as-is; not used by the methods of this class
                (presumably HTTP basic-auth data -- confirm with callers).
            canalblog: Address of the Canalblog page to download. May be a
                full URL or a bare host name.
            wordpress: Stored and reported in the destructor message
                (presumably the target WordPress host -- confirm).
            logger: Logger used for error/debug output. When ``None`` a
                module-level logger is used so logging never fails.
            parser: Parser name handed to ``BeautifulSoup``.
            ssl_canalblog: Pass ``False`` to reach Canalblog over ``http``.
            ssl_wordpress: Pass ``False`` to use ``http`` for WordPress.
        """
        self._name = name
        self._basic = basic
        self._canalblog = canalblog
        self._wordpress = wordpress
        self._logger = logger if logger is not None else logging.getLogger(__name__)
        self._parser = parser
        self._headers_json = {'Content-Type': 'application/json; charset=utf-8', 'Accept':'application/json'}
        self._directory = "backup"

        # Only an explicit ``False`` switches a protocol to plain http; any
        # other value keeps https.
        self._protocol_wordpress = "http" if ssl_wordpress is False else "https"
        self._protocol_canalblog = "http" if ssl_canalblog is False else "https"

        self._request = requests.Session()
        retries = Retry(connect=10, read=10, redirect=5,
                        status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)
        adapter = HTTPAdapter(max_retries=retries)
        # Install the retry policy for every protocol this instance may use.
        for protocol in sorted({self._protocol_wordpress, self._protocol_canalblog}):
            self._request.mount('{0}://'.format(protocol), adapter)

    # Destructor
    def __del__(self):
        """Print a completion message when the instance is destroyed."""
        print("{0} : Import finished for {1}".format(self._name, self._wordpress))

    # Private method
    ## Build the Canalblog URL with the configured protocol
    def _canalblog_url(self):
        """Return the Canalblog address with its scheme forced to the configured protocol.

        A bare host such as ``example.canalblog.com`` has no network location
        when parsed directly, so it is re-parsed with a leading ``//`` to make
        the host land in the right URL component.
        """
        parsed = urlparse(self._canalblog)
        if not parsed.netloc:
            parsed = urlparse("//" + self._canalblog.lstrip("/"))
        return parsed._replace(scheme=self._protocol_canalblog).geturl()

    # Public method
    ## Get from URL
    def fromUrl(self):
        """Download the Canalblog page and traverse its ``listsmooth`` menu.

        Errors are logged through the instance logger. Returns ``None``.

        Raises:
            SystemExit: With code 1 when the connection to the page fails.
        """
        # Fallback value so the handlers below always have a URL to report,
        # even if building the final URL itself fails.
        url = self._canalblog
        try:
            url = self._canalblog_url()
            page = self._request.get(url)
            if page.status_code == 200:
                soup = BeautifulSoup(page.text, self._parser)
                menus = soup.find_all("ul", id="listsmooth")
                if not menus:
                    self._logger.error("{0} : menu listsmooth not found for url {1}".format(self._name, url))
                    return
                for top_item in menus[0].find_all("li"):
                    for sub_item in top_item.find_all("li"):
                        # NOTE(review): the anchor is looked up but nothing
                        # consumes it yet; the menu import looks unfinished.
                        link = sub_item.find("a")
            else:
                self._logger.error("{0} : index didn't get due status code : {1}".format(self._name, page.status_code))
                self._logger.debug("{0} : {1}".format(self._name, page.content))
        except (requests.exceptions.ConnectionError, ConnectionError) as err:
            # requests raises its own ConnectionError, which is unrelated to
            # the builtin of the same name, so both are listed.
            self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, url, err))
            raise SystemExit(1)
        except Exception as err:
            self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, url, err))