From 3c2f1cc0170f1e33b04744efa78b1c5f7768e17b Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Sun, 7 May 2023 17:38:44 +0200 Subject: [PATCH] separate publication and principal --- import_export_canalblog.py | 23 +++++++++++++++-------- lib/WPExport.py | 21 +++++++++++++-------- lib/WPImport.py | 30 ++++++++++++++---------------- 3 files changed, 42 insertions(+), 32 deletions(-) diff --git a/import_export_canalblog.py b/import_export_canalblog.py index c931a5f..c6cc592 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -14,13 +14,13 @@ from lib.WPExport import WPExport def download(name_thread, max_thread, url, logger, parser, directory, html, img): exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory) webpage = exportWp.getUrlPage(name_thread, max_thread) - if html is False: - exportWp.downloadHTML(webpage["article"]) - exportWp.downloadHTML(webpage["page"]) + for i in ["article", "page"]: + for j in ["publications", "principal"]: + if html is False: + exportWp.downloadHTML(webpage[j][i]) - if img is False: - exportWp.downloadImg(webpage["article"]) - exportWp.downloadImg(webpage["page"]) + if img is False: + exportWp.downloadImg(webpage[j][i]) @@ -44,7 +44,10 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas del exportWp for j in wordpress: importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser) - importWp.fromUrl(webpage) + for k in ["article", "page"]: + for l in ["publications", "principal"]: + importWp.fromUrl(webpage[l][k]) + del importWp else: if len(canalblog) != len(wordpress): @@ -62,7 +65,11 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas webpage = exportWp.getUrlPage(name_thread, max_thread) del exportWp importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser) - importWp.fromUrl(webpage) + + for k in ["article", "page"]: + for l in ["publications", "principal"]: + importWp.fromUrl(webpage[l][k]) + del importWp diff --git a/lib/WPExport.py b/lib/WPExport.py index b817837..d6b2ee9 100644 --- a/lib/WPExport.py +++ b/lib/WPExport.py @@ -85,9 +85,14 @@ class WPExport: self._logger.error("{0} : URL did not get due status code : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(self._name, page.content)) - webpage = {"page":[], "article":[]} - + webpage = {"principal": {"page":[], "article":[]}, "publications": {"page":[], "article":[]}} for i in page_url: + section = "publications" + o = urlparse(i) + o = o._replace(scheme="https") + i = o.geturl().replace(":///", "://") + if i == "{0}/".format(self._url): + section = "principal" try: page = self._request.get(i) except Exception as err: @@ -95,8 +100,8 @@ class WPExport: exit(1) if page.status_code == 200: self._logger.info("{0} : page : {1}".format(self._name, i)) - if i not in webpage["page"]: - webpage["page"].append(i) + if i not in webpage[section]["page"]: + webpage[section]["page"].append(i) soup = BeautifulSoup(page.text, self._parser) class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline") if len(class_div) > 0: @@ -122,22 +127,22 @@ class WPExport: if len(categorie) > 2: url_paging = "{0}/archives/{1}/p{2}-10.html".format(self._url, categorie[2], paging) self._logger.info("{0} : {1}".format(self._name, url_paging)) - if url_paging not in webpage["page"]: - webpage["page"].append(url_paging) + if url_paging not in webpage[section]["page"]: + webpage[section]["page"].append(url_paging) page = self._request.get(url_paging) if page.status_code == 200: soup = BeautifulSoup(page.text, self._parser) h2 = soup.find_all("h2") for title in h2: href = title.find_all("a")[0].get("href", "/") - if href not in webpage["article"]: + if href not in webpage[section]["article"]: try: o = urlparse(href) o = o._replace(scheme="https").geturl() except Exception as err: self._logger.error("parsing error : {0}".format(err)) exit(1) - webpage["article"].append(o) + webpage[section]["article"].append(o) else: self._logger.error("{0} : web didn't get due status code : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(self._name, page.content)) diff --git a/lib/WPImport.py b/lib/WPImport.py index 6f82ec4..3badd98 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -512,22 +512,20 @@ class WPimport: if len(result) > 0: for i in result: self._logger.debug("{0} : Data for post to delete : {1}".format(self._name, i)) - if i["title"]["rendered"] == title: - self._logger.info("{0} : Page {1} already exist and going to delete".format(self._name, title)) - post_id = i["id"] - try: - params = {"force":1} - page = self._request.delete("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, params=params) - except Exception as err: - self._logger.error("{0} : Connection error for delete post : {1}".format(self._name, err)) - exit(1) - if page.status_code == 200: - result = page.json() - self._logger.info("{0} : Post deleted : {1}".format(self._name, title)) - - else: - self._logger.error("{0} : Post not deleted due status code : {1}".format(self._name, page.status_code)) - self._logger.debug("{0} : {1}".format(self._name, page.content)) + self._logger.info("{0} : Page {1} already exist and going to delete".format(self._name, title)) + post_id = i["id"] + try: + params = {"force":1} + page = self._request.delete("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, params=params) + except Exception as err: + self._logger.error("{0} : Connection error for delete post : {1}".format(self._name, err)) + exit(1) + if page.status_code == 200: + result = page.json() + self._logger.info("{0} : Post deleted : {1}".format(self._name, title)) + else: + self._logger.error("{0} : Post not deleted due status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) else: self._logger.error("{0} : Connection for delete post error with status code : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(self._name, page.content))