From 9ed08ea964f2725812236f3897f2d9c5c976e2b7 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Sun, 9 Apr 2023 21:45:51 +0200 Subject: [PATCH] add parameter parser --- WPExport.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/WPExport.py b/WPExport.py index 87571d7..6fd0b8a 100644 --- a/WPExport.py +++ b/WPExport.py @@ -4,9 +4,10 @@ from urllib.parse import urlparse import requests, os, argparse, logging class WPExport: - def __init__(self, url, logger): + def __init__(self, url, logger, parser): self._url = url self._logger = logger + self._parser = parser def _mkdirPath(self, path_dir, logger): if not os.path.exists(path_dir): @@ -34,7 +35,7 @@ class WPExport: exit(1) page_url = [] if page.status_code == 200: - soup = BeautifulSoup(page.text, 'html.parser') + soup = BeautifulSoup(page.text, self._parser) if js is True: script = soup.find_all("script") for anchor in script: @@ -78,7 +79,7 @@ class WPExport: self._logger.error("Connection error : {0}".format(err)) exit(1) if page.status_code == 200: - soup = BeautifulSoup(page.text, 'html.parser') + soup = BeautifulSoup(page.text, self._parser) img = soup.find_all("img") self._logger.info("image from page: {0} : ".format(i)) for anchor in img: @@ -97,7 +98,7 @@ class WPExport: exit(1) page_url = [] if page.status_code == 200: - soup = BeautifulSoup(page.text, 'html.parser') + soup = BeautifulSoup(page.text, self._parser) ul = soup.find_all("ul", id="listsmooth") for anchor in ul[0].find_all("a"): href = anchor.get('href', '/') @@ -115,7 +116,7 @@ class WPExport: self._logger.info("page : {0}".format(i)) if i not in webpage: webpage.append(i) - soup = BeautifulSoup(page.text, 'html.parser') + soup = BeautifulSoup(page.text, self._parser) class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline") if len(class_div) > 0: pagingfirstline = class_div[0].find_all("a") @@ -135,7 +136,7 @@ class WPExport: webpage.append(url_paging) page = requests.get(url_paging) if page.status_code == 200: - soup = BeautifulSoup(page.text, 'html.parser') + soup = BeautifulSoup(page.text, self._parser) h2 = soup.find_all("h2") for title in h2: href = title.find_all("a")[0].get("href", "/")