From 7b154e3a1dbae8896f82c2dbb6a3a333daeadc40 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Thu, 31 Aug 2023 22:50:31 +0200 Subject: [PATCH] add author --- import_export_canalblog.py | 2 +- lib/WPImport.py | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/import_export_canalblog.py b/import_export_canalblog.py index d5d2a18..eca7e72 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -259,7 +259,7 @@ if __name__ == '__main__': basic = HTTPBasicAuth(args.user, password) if args.command == "import": wordpress = args.wordpress.split(",") - importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser, ssl_wordpress=ssl_wordpress, author=args.author) + importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser, ssl_wordpress=ssl_wordpress, author=args.author, ssl_canalblog=ssl_canalblog) if len(args.file) > 0: for i in wordpress: importWp.setUrl(i) diff --git a/lib/WPImport.py b/lib/WPImport.py index b3e5d80..56c51a7 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -8,7 +8,7 @@ from requests.packages.urllib3.util.retry import Retry class WPimport: # Constructor - def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog", author=""): + def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog", author="", ssl_canalblog=True): self._name = name self._basic = basic self._wordpress = wordpress @@ -20,7 +20,7 @@ class WPimport: if ssl_wordpress is False: self._protocol = "http" self._request = requests.Session() - + self._ssl_canalblog = ssl_canalblog retries = Retry(connect=10, read=10, redirect=5, status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2) @@ -140,6 +140,8 @@ class WPimport: return author def _getInfoAlbum(self, link): + if self._ssl_canalblog: + link = link.replace("http", "https") self._logger.info("{0} : Info album : {1}".format(self._name, link)) link_o = urlparse(link) if len(link_o.netloc) > 0: @@ -147,7 +149,7 @@ class WPimport: try: response = self._request.get(link) if response.status_code == 200: - self._logger.info("{0} : get content info from web : {1}".format(self._name, link_o)) + self._logger.info("{0} : get content info from web : {1}".format(self._name, link)) page_img = response.content except ConnectionError as err: self._logger.error("{0} : Connection error for get album info : {1}".format(self._name, err)) @@ -159,9 +161,15 @@ class WPimport: self._logger.info("{0} : get album info from file : {1}".format(self._name, link_o)) if os.path.exists("{0}/..{1}".format(self._directory, link_o)): page_img = open("{0}/..{1}".format(self._directory, link_o), "r") - soup = BeautifulSoup(content, self._parser) + soup = BeautifulSoup(page_img, self._parser) paragraph = soup.find("div", class_="albumbody").find("p") - author = paragraph.split("
")[1].split(":")[1].replace(" ", "").lower() + self._logger.info("{0} get paragraph : {1}".format(self._name, paragraph)) + split_paragraph = str(paragraph).split("
") + self._logger.info("{0} length paragraph splitted : {1}".format(self._name, len(split_paragraph))) + if len(split_paragraph) == 1: + split_paragraph = str(paragraph).split("
") + self._logger.info("{0} get paragraph splitted : {1}".format(self._name, split_paragraph)) + author = split_paragraph[1].split(":")[1].replace(" ", "").lower() return author @@ -226,6 +234,7 @@ class WPimport: for i in link_a: if re.search(r"/albums/", i.get("href", "/")): href_a = i.get("href", "/") + break author = self._getInfoAlbum(href_a) self._logger.info("{0} : author : {1}".format(self._name, author)) self._getAuthor(author)