From bf4c2480f82733c4b421977c5b6fcdb87da3256b Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Thu, 27 Apr 2023 00:00:53 +0200 Subject: [PATCH] import threading for directory WIP --- import_export_canalblog.py | 83 ++++++++++++++++++++++---------------- lib/WPImport.py | 18 ++++++++- 2 files changed, 65 insertions(+), 36 deletions(-) diff --git a/import_export_canalblog.py b/import_export_canalblog.py index e2e8a99..0066654 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -10,7 +10,6 @@ from lib.WPExport import WPExport def download(name_thread, max_thread, url, logger, parser, directory, html, img): -#def download(args): exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory) @@ -21,6 +20,46 @@ def download(name_thread, max_thread, url, logger, parser, directory, html, img) if args.img is False: exportWp.downloadImg(webpage) +def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial): + canalblog = canalblog.split(",") + wordpress = wordpress.split(",") + name = "Thread-{0}".format(int(name_thread) + 1) + + if serial is False: + for canal in canalblog: + try: + o = urlparse(canal) + o = o._replace(scheme="https") + url = o.geturl().replace(":///", "://") + except Exception as err: + logger.error("{0} : parsing error : {1}".format(name, err)) + exit(1) + exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser) + webpage = exportWp.getUrlPage(name_thread, max_thread) + for j in wordpress: + importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser) + importWp.fromUrl(webpage) + else: + if len(canalblog) != len(wordpress): + logger.error("{0} : ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress".format(name)) + exit(1) + for i in range(0, len(canalblog)): + try: + o = urlparse(canalblog[i]) + o = o._replace(scheme="https") + url 
= o.geturl().replace(":///", "://") + except Exception as err: + logger.error("parsing error : {0}".format(err)) + exit(1) + exportWp = WPExport(name=name, url=url, logger=logger, parser=parser) + webpage = exportWp.getUrlPage(name_thread, max_thread) + importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser) + importWp.fromUrl(webpage) + + +def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic): + name = "Thread-{0}".format(int(name_thread) + 1) + importWp = WPimport(name=name, basic=basic, wordpress=wordpress, logger=logger, parser=parser) @@ -112,40 +151,14 @@ if __name__ == '__main__': importWp.fromDirectory(directory[i]) exit(0) if len(args.canalblog) > 0: - exportWp = WPExport("", logger, args.parser, args.directory) - canalblog = args.canalblog.split(",") - wordpress = args.wordpress.split(",") - - if args.serial is False: - for canal in canalblog: - try: - o = urlparse(canal) - o = o._replace(scheme="https") - url = o.geturl().replace(":///", "://") - except Exception as err: - logger.error("parsing error : {0}".format(err)) - exit(1) - exportWp.setUrl(url) - webpage = exportWp.getUrlPage() - for j in wordpress: - importWp.setUrl(j) - importWp.fromUrl(webpage) - else: - if len(canalblog) != len(wordpress): - logger.error("ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress") - exit(1) - for i in range(0, len(canalblog)-1): - try: - o = urlparse(canalblog[i]) - o = o._replace(scheme="https") - url = o.geturl().replace(":///", "://") - except Exception as err: - logger.error("parsing error : {0}".format(err)) - exit(1) - exportWp.setUrl(url) - webpage = exportWp.getUrlPage() - importWp.setUrl(wordpress[i]) - importWp.fromUrl(webpage) + try: + with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: + wait_for = [ + ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial) + for i in 
range(0, int(args.parallel)) + ] + except Exception as err: + logger.error("Threading error : {0}".format(err)) diff --git a/lib/WPImport.py b/lib/WPImport.py index d925f67..8980ad5 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -8,7 +8,8 @@ from requests.packages.urllib3.util.retry import Retry class WPimport: # Constructor - def __init__(self, basic, wordpress, logger, parser): + def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser"): + self._name = name self._basic = basic self._wordpress = wordpress self._logger = logger @@ -69,6 +70,21 @@ class WPimport: # Private method + ## From files in split by thread + + def fromFile(self, files): + for i in range(0, len(files)): + if os.path.exists(files[i]): + self._logger.info("Fichier en cours de traitement : {0}".format(files[i])) + with open(files[i], 'r') as f: + content = f.read() + soup = BeautifulSoup(content, self._parser) + articlebody = soup.find_all("div", class_="articlebody") + if len(articlebody) > 0: + self._addOrUpdatePost(soup) + else: + self._addOrUpdateFeaturedMedia(soup) + ## Get all files def _getFiles(self, item):