From f9be6770e3581ddb44db8cf411749a57a6560b9c Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Sun, 7 May 2023 09:26:48 +0200 Subject: [PATCH] separate article and page --- .gitignore | 2 +- import_export_canalblog.py | 22 ++++++++++++++-------- lib/WPExport.py | 20 +++++++++++--------- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index a73c961..f82fde7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ backup*/ wp-navigation -web_scrap.log +*.log __pycache__/ diff --git a/import_export_canalblog.py b/import_export_canalblog.py index b44ab9c..c931a5f 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -3,23 +3,27 @@ from requests.auth import HTTPBasicAuth from getpass import getpass from urllib.parse import urlparse from concurrent import futures +from concurrent.futures import as_completed, wait import argparse, logging, threading from lib.WPImport import WPimport from lib.WPExport import WPExport + def download(name_thread, max_thread, url, logger, parser, directory, html, img): - - exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory) - + exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory) webpage = exportWp.getUrlPage(name_thread, max_thread) if html is False: - exportWp.downloadHTML(webpage) - - if args.img is False: - exportWp.downloadImg(webpage) - del exportWp + exportWp.downloadHTML(webpage["article"]) + exportWp.downloadHTML(webpage["page"]) + + if img is False: + exportWp.downloadImg(webpage["article"]) + exportWp.downloadImg(webpage["page"]) + + + def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial): canalblog = canalblog.split(",") @@ -198,6 +202,8 @@ if __name__ == '__main__': if args.css is False: exportWp.downloadCss() del exportWp + + if args.html is False or args.img is False: try: diff --git a/lib/WPExport.py b/lib/WPExport.py index 3bc4b6d..b817837 100644 --- a/lib/WPExport.py +++ b/lib/WPExport.py @@ -13,6 +13,7 @@ class WPExport: self._dir = directory self._name = name + self._request = requests.Session() retries = Retry(total=10, @@ -27,6 +28,7 @@ class WPExport: # Public method + # Set name def setName(self, name): @@ -83,8 +85,8 @@ class WPExport: self._logger.error("{0} : URL did not get due status code : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(self._name, page.content)) + webpage = {"page":[], "article":[]} - webpage = [] for i in page_url: try: page = self._request.get(i) @@ -93,8 +95,8 @@ class WPExport: exit(1) if page.status_code == 200: self._logger.info("{0} : page : {1}".format(self._name, i)) - if i not in webpage: - webpage.append(i) + if i not in webpage["page"]: + webpage["page"].append(i) soup = BeautifulSoup(page.text, self._parser) class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline") if len(class_div) > 0: @@ -120,27 +122,27 @@ class WPExport: if len(categorie) > 2: url_paging = "{0}/archives/{1}/p{2}-10.html".format(self._url, categorie[2], paging) self._logger.info("{0} : {1}".format(self._name, url_paging)) - if url_paging not in webpage: - webpage.append(url_paging) + if url_paging not in webpage["page"]: + webpage["page"].append(url_paging) page = self._request.get(url_paging) if page.status_code == 200: soup = BeautifulSoup(page.text, self._parser) h2 = soup.find_all("h2") for title in h2: href = title.find_all("a")[0].get("href", "/") - if href not in webpage: + if href not in webpage["article"]: try: o = urlparse(href) o = o._replace(scheme="https").geturl() except Exception as err: self._logger.error("parsing error : {0}".format(err)) exit(1) - webpage.append(o) + webpage["article"].append(o) else: self._logger.error("{0} : web didn't get due status code : {1}".format(self._name, page.status_code)) - self._logger.debug(page.content) - + self._logger.debug("{0} : {1}".format(self._name, page.content)) return webpage + # Private method