From 2e21040196d80bb65a5b865caba1ca74b0d97f75 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Tue, 29 Aug 2023 22:26:15 +0200 Subject: [PATCH 1/6] add private method get info album --- lib/WPImport.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/lib/WPImport.py b/lib/WPImport.py index e28cf1a..f1627ae 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -139,6 +139,24 @@ class WPimport: self._logger.error("{0} : Exception error for get author : {1}".format(self._name, err)) return author + def _getInfoAlbum(self, link): + self._logger.info("{0} : Info album : {1}".format(self._name, link)) + link_o = urlparse(link) + if len(link_o.netloc) > 0: + self._logger.info("{0} : get album info from web : {1}".format(self._name, link_o)) + try: + response = self._request.get(link) + if response.status_code == 200: + self._logger.info("{0} : get content info from web : {1}".format(self._name, link_o)) + except ConnectionError as err: + self._logger.error("{0} : Connection error for get album info : {1}".format(self._name, err)) + exit(1) + except Exception as err: + self._logger.error("{0} : Exception error for get album info : {1}".format(self._name, err)) + exit(1) + else: + self._logger.info("{0} : get album info from file : {1}".format(self._name, link_o)) + def _addOrUpdateAlbum(self, soup): self._logger.info("{0} : Add/Update Album".format(self._name)) albumbody = soup.find("div", class_="albumbody") From 2279e4b0b6b4993821b1128a946b074e0cde442c Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Wed, 30 Aug 2023 22:39:59 +0200 Subject: [PATCH 2/6] search title album 50% --- import_export_canalblog.py | 2 +- lib/WPImport.py | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/import_export_canalblog.py b/import_export_canalblog.py index 78fdb0f..d5d2a18 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -170,7 +170,7 @@ if __name__ == '__main__': import_parser.add_argument("--no-create", help="No create post", dest="create", default="store_false", action="store_true") import_parser.add_argument("--no-update", help="No update post", dest="update", default="store_false", action="store_true") import_parser.add_argument("--no-image", help="No image add or update", dest="image", default="store_false", action="store_true") - import_parser.add_argument("--author-album", dest=author, help="Define author for page album", default="") + import_parser.add_argument("--author-album", dest="author", help="Define author for page album", default="") diff --git a/lib/WPImport.py b/lib/WPImport.py index f1627ae..14d8c81 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -148,6 +148,7 @@ class WPimport: response = self._request.get(link) if response.status_code == 200: self._logger.info("{0} : get content info from web : {1}".format(self._name, link_o)) + page_img = response.content except ConnectionError as err: self._logger.error("{0} : Connection error for get album info : {1}".format(self._name, err)) exit(1) @@ -156,6 +157,13 @@ class WPimport: exit(1) else: self._logger.info("{0} : get album info from file : {1}".format(self._name, link_o)) + if os.path.exists("{0}/..{1}".format(self._directory, link_o)): + page_img = open("{0}/..{1}".format(self._directory, link_o), "r") + soup = BeautifulSoup(content, self._parser) + paragraph = soup.find("div", class_="albumbody").find("p") + author = paragraph.split("
")[1].split(":")[1].replace(" ", "").lower() + return author + def _addOrUpdateAlbum(self, soup): self._logger.info("{0} : Add/Update Album".format(self._name)) @@ -213,8 +221,14 @@ class WPimport: data = {"title":albumtitle, "content":content_html, "status":"publish"} if len(self._author) > 0: author = self._getAuthor(self._author) - if author != 0: - data = {"title":albumtitle, "content":content_html, "status":"publish", "author":author} + else: + link_a = albumbody.find_all("a") + href_a = link_a[0].get("href", "/") + author = self._getInfoAlbum(href_a) + self._logger.info("{0} : author : {1}".format(self._name, author)) + self._getAuthor(author) + if author != 0: + data = {"title":albumtitle, "content":content_html, "status":"publish", "author":author} self._logger.debug("{0} : data for album page : {1}".format(self._name, data)) for index in range(1,10): params = {"search": albumtitle, "per_page":100, "page": index} From e5109204aa792beaff14c706c73e7024fa9423a9 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Wed, 30 Aug 2023 23:45:16 +0200 Subject: [PATCH 3/6] get link with album --- lib/WPImport.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/WPImport.py b/lib/WPImport.py index 14d8c81..b3e5d80 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -223,7 +223,9 @@ class WPimport: author = self._getAuthor(self._author) else: link_a = albumbody.find_all("a") - href_a = link_a[0].get("href", "/") + for i in link_a: + if re.search(r"/albums/", i.get("href", "/")): + href_a = i.get("href", "/") author = self._getInfoAlbum(href_a) self._logger.info("{0} : author : {1}".format(self._name, author)) self._getAuthor(author) From 7b154e3a1dbae8896f82c2dbb6a3a333daeadc40 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Thu, 31 Aug 2023 22:50:31 +0200 Subject: [PATCH 4/6] add author --- import_export_canalblog.py | 2 +- lib/WPImport.py | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/import_export_canalblog.py b/import_export_canalblog.py index d5d2a18..eca7e72 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -259,7 +259,7 @@ if __name__ == '__main__': basic = HTTPBasicAuth(args.user, password) if args.command == "import": wordpress = args.wordpress.split(",") - importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser, ssl_wordpress=ssl_wordpress, author=args.author) + importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser, ssl_wordpress=ssl_wordpress, author=args.author, ssl_canalblog=ssl_canalblog) if len(args.file) > 0: for i in wordpress: importWp.setUrl(i) diff --git a/lib/WPImport.py b/lib/WPImport.py index b3e5d80..56c51a7 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -8,7 +8,7 @@ from requests.packages.urllib3.util.retry import Retry class WPimport: # Constructor - def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog", author=""): + def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog", author="", ssl_canalblog=True): self._name = name self._basic = basic self._wordpress = wordpress @@ -20,7 +20,7 @@ class WPimport: if ssl_wordpress is False: self._protocol = "http" self._request = requests.Session() - + self._ssl_canalblog = ssl_canalblog retries = Retry(connect=10, read=10, redirect=5, status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2) @@ -140,6 +140,8 @@ class WPimport: return author def _getInfoAlbum(self, link): + if self._ssl_canalblog: + link = link.replace("http", "https") self._logger.info("{0} : Info album : {1}".format(self._name, link)) link_o = urlparse(link) if len(link_o.netloc) > 0: @@ -147,7 +149,7 @@ class WPimport: try: response = self._request.get(link) if response.status_code == 200: - self._logger.info("{0} : get content info from web : {1}".format(self._name, link_o)) + self._logger.info("{0} : get content info from web : {1}".format(self._name, link)) page_img = response.content except ConnectionError as err: self._logger.error("{0} : Connection error for get album info : {1}".format(self._name, err)) @@ -159,9 +161,15 @@ class WPimport: self._logger.info("{0} : get album info from file : {1}".format(self._name, link_o)) if os.path.exists("{0}/..{1}".format(self._directory, link_o)): page_img = open("{0}/..{1}".format(self._directory, link_o), "r") - soup = BeautifulSoup(content, self._parser) + soup = BeautifulSoup(page_img, self._parser) paragraph = soup.find("div", class_="albumbody").find("p") - author = paragraph.split("
")[1].split(":")[1].replace(" ", "").lower() + self._logger.info("{0} get paragraph : {1}".format(self._name, paragraph)) + split_paragraph = str(paragraph).split("
") + self._logger.info("{0} length paragraph splitted : {1}".format(self._name, len(split_paragraph))) + if len(split_paragraph) == 1: + split_paragraph = str(paragraph).split("
") + self._logger.info("{0} get paragraph splitted : {1}".format(self._name, split_paragraph)) + author = split_paragraph[1].split(":")[1].replace(" ", "").lower() return author @@ -226,6 +234,7 @@ class WPimport: for i in link_a: if re.search(r"/albums/", i.get("href", "/")): href_a = i.get("href", "/") + break author = self._getInfoAlbum(href_a) self._logger.info("{0} : author : {1}".format(self._name, author)) self._getAuthor(author) From 963f83ae81cf8735d68ceff5e50e5849e7e10da1 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Sat, 2 Sep 2023 00:26:50 +0200 Subject: [PATCH 5/6] fix author --- lib/WPImport.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/WPImport.py b/lib/WPImport.py index 56c51a7..d0240a9 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -237,7 +237,9 @@ class WPimport: break author = self._getInfoAlbum(href_a) self._logger.info("{0} : author : {1}".format(self._name, author)) - self._getAuthor(author) + author = self._getAuthor(author) + data = {"title":albumtitle, "content":content_html, "status":"publish"} + if author != 0: data = {"title":albumtitle, "content":content_html, "status":"publish", "author":author} self._logger.debug("{0} : data for album page : {1}".format(self._name, data)) @@ -849,7 +851,9 @@ class WPimport: self._logger.error("{0} : Exception error for post {1} : {2}".format(self._name, i, err)) title = articletitle[0].text - author = articleacreator[0].text.lower() + author = articleacreator[0].text.lower() + if len(self._author) > 0: + author = self._author body = articlebody[0].find_all("p") bodyhtml = "

" for i in body: From 279a9f27860aab733eaa592081e040e412d44df5 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Sat, 2 Sep 2023 00:27:18 +0200 Subject: [PATCH 6/6] fix parameter author --- import_export_canalblog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/import_export_canalblog.py b/import_export_canalblog.py index eca7e72..ce2068e 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -170,7 +170,7 @@ if __name__ == '__main__': import_parser.add_argument("--no-create", help="No create post", dest="create", default="store_false", action="store_true") import_parser.add_argument("--no-update", help="No update post", dest="update", default="store_false", action="store_true") import_parser.add_argument("--no-image", help="No image add or update", dest="image", default="store_false", action="store_true") - import_parser.add_argument("--author-album", dest="author", help="Define author for page album", default="") + import_parser.add_argument("--author", dest="author", help="Define author", default="")