From 4789fe80aa5d57719563decc3e3279b9f10a6f1e Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Tue, 2 May 2023 16:59:31 +0200 Subject: [PATCH 01/19] fix import 50% --- lib/WPImport.py | 152 ++++++++++++++++++++++++------------------------ 1 file changed, 77 insertions(+), 75 deletions(-) diff --git a/lib/WPImport.py b/lib/WPImport.py index e3ed2ee..50b58b6 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -18,7 +18,7 @@ class WPimport: self._request = requests.Session() - retries = Retry(total=10, + retries = Retry(connect=10, read=10, redirect=5, status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2) self._request.mount('http://', HTTPAdapter(max_retries=retries)) @@ -26,7 +26,7 @@ class WPimport: # Destructor def __del__(self): - self._logger.info("{0} : Import finished for {1}".format(self._name, self._wordpress)) + print("{0} : Import finished for {1}".format(self._name, self._wordpress)) # Public method @@ -192,54 +192,60 @@ class WPimport: ## Add or update img def _addOrUpdateMedia(self, href_img, page): + media_authorized = ["png", "jpg", "jpeg", "svg"] media = {"id":"", "rendered":""} split_fileimg = href_img.split("/") img_name = split_fileimg[len(split_fileimg)-1] - self._logger.debug("{0} : Search for image {1} with URL {2}".format(self._name, img_name, "http://{0}/wp-json/wp/v2/media".format(self._wordpress))) - params = { "search": img_name} - try: - r = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params) - except Exception as err: - self._logger.error("{0} : Connection error for search media : {1}".format(self._name, err)) - exit(1) - self._logger.debug("{0} : Search for image {1} and his status code {2}".format(self._name, img_name, r.status_code)) - if r.status_code == 200: - res = r.json() - if len(res) > 0: - params = {"force":1} + img_type_file = img_name.split(".")[len(img_name.split("."))-1] + is_img = True + if img_type_file not in media_authorized: + self._logger.error("{0} : Element {1} is not image".format(self._name,img_name)) + is_img = False + if is_img is True: + self._logger.debug("{0} : Search for image {1} with URL {2}".format(self._name, img_name, "http://{0}/wp-json/wp/v2/media".format(self._wordpress))) + params = { "search": img_name} + try: + r = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params) + except Exception as err: + self._logger.error("{0} : Connection error for search media : {1}".format(self._name, err)) + exit(1) + self._logger.debug("{0} : Search for image {1} and his status code {2}".format(self._name, img_name, r.status_code)) + if r.status_code == 200: + res = r.json() + self._logger.debug("{0} : Number of image in search : {1}".format(self._name, len(res))) + if len(res) > 0: + params = {"force":1} + try: + r = self._request.delete("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, res[0]["id"]), auth=self._basic, params=params) + except Exception as err: + self._logger.error("{0} Connection error for delete image : {1}".format(self._name, err)) + exit(1) + if r.status_code == 200: + self._logger.info("{0} : Image removed {1}".format(self._name, img_name)) + else: + self._logger.error("{0} : Image {1} not removed due status code : {2}".format(self._name, img_name, r.status_code)) + self._logger.debug("{0} : {1}".format(self._name, r.content)) + + data = page.content + img_type = "image/{0}".format(img_type_file) + headers={ 'Content-Type': img_type,'Content-Disposition' : 'attachment; filename={0}'.format(img_name)} try: - r = self._request.delete("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, res[0]["id"]), auth=self._basic, params=params) + r = self._request.post("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, headers=headers, data=data) except Exception as err: - self._logger.error("{0} Connection error for delete image : {1}".format(self._name, err)) + self._logger.error("{0} : Connection error for add image : {1}".format(self._name, err)) exit(1) - if r.status_code == 200: - self._logger.info("{0} : Image removed {1}".format(self._name, img_name)) + if r.status_code == 201: + self._logger.info("{0} : Image added {1}".format(self._name, img_name)) + res = r.json() + media["id"] = res["id"] + media["rendered"] = res["guid"]["rendered"] else: - self._logger.error("{0} : Image not removed due status code : {1}".format(self._name, r.status_code)) + self._logger.error("{0} : Image {1}.{2} not added due status code : {3}".format(self._name, img_name, img_type, r.status_code)) self._logger.debug("{0} : {1}".format(self._name, r.content)) - data = page.content - img_type = "image/png" - if img_name.split(".")[1] == "jpg" or img_name.split(".")[1] == "jpeg": - img_type = "image/jpg" - headers={ 'Content-Type': img_type,'Content-Disposition' : 'attachment; filename={0}'.format(img_name)} - try: - r = self._request.post("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, headers=headers, data=data) - except Exception as err: - self._logger.error("{0} : Connection error for add image : {1}".format(self._name, err)) - exit(1) - if r.status_code == 201: - self._logger.info("{0} : Image added {1}".format(self._name, img_name)) - res = r.json() - media["id"] = res["id"] - media["rendered"] = res["guid"]["rendered"] else: - self._logger.error("{0} : Image not added due status code : {1}".format(self._name, r.status_code)) - self._logger.debug(r.content) - - else: - self._logger.error("{0} : Connection error for search image with status code : {1}".format(self._name, r.status_code)) - self._logger.debug("{0} : {1}".format(self._name, r.content)) + self._logger.error("{0} : Connection error for search image with status code : {1}".format(self._name, r.status_code)) + self._logger.debug("{0} : {1}".format(self._name, r.content)) return media @@ -498,45 +504,41 @@ class WPimport: except Exception as err: self._logger.error("{0} : Connection error for search post : {1}".format(self._name, err)) exit(1) - page_exist = True headers = {'Content-Type': 'application/json', 'Accept':'application/json'} if page.status_code == 200: result = page.json() - if len(result) == 0: - page_exist = False - else: - self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, title)) - post_id = result[0]["id"] - try: - page = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, data=json.dumps(data)) - except Exception as err: - self._logger.error("{0} : Connection error for update post : {1}".format(self._name, err)) - exit(1) - if page.status_code == 200: - result = page.json() - self._logger.info("{0} : Post updated : {1}".format(self._name, result["title"]["raw"])) - self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) - self._linkImgPost(result["title"]["raw"], list_img, result["id"]) - else: - self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code)) - self._logger.debug("{0} : {1}".format(self._name, page.content)) - + if len(result) > 0: + for i in result: + self._logger.info("{0} : Page {1} already exist and going to delete".format(self._name, title)) + post_id = i["id"] + try: + params = {"force":1} + page = self._request.delete("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, params=params) + except Exception as err: + self._logger.error("{0} : Connection error for delete post : {1}".format(self._name, err)) + exit(1) + if page.status_code == 200: + result = page.json() + self._logger.info("{0} : Post deleted : {1}".format(self._name, title)) + + else: + self._logger.error("{0} : Post not deleted due status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) else: - self._logger.error("{0} : Connection for update post error with status code : {1}".format(self._name, page.status_code)) + self._logger.error("{0} : Connection for delete post error with status code : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(self._name, page.content)) - if page_exist == False: - try: - page = self._request.post("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, headers=headers, data=json.dumps(data)) - except Exception as err: - self._logger.error("{0} : Connection error for create post : {1}".format(self._name, err)) - exit(1) - if page.status_code == 201: - result = page.json() - self._logger.info("{0} : Post added : {1}".format(self._name, result["title"]["raw"])) - self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) - self._linkImgPost(result["title"]["raw"], list_img, result["id"]) - else: - self._logger.error("{0} : Post not added due status code : {1}".format(self._name, r.status_code)) - self._logger.debug("{0} : {1}".format(self._name, r.content)) + try: + page = self._request.post("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, headers=headers, data=json.dumps(data)) + except Exception as err: + self._logger.error("{0} : Connection error for create post : {1}".format(self._name, err)) + exit(1) + if page.status_code == 201: + result = page.json() + self._logger.info("{0} : Post added : {1}".format(self._name, result["title"]["raw"])) + self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) + self._linkImgPost(result["title"]["raw"], list_img, result["id"]) + else: + self._logger.error("{0} : Post not added due status code : {1}".format(self._name, r.status_code)) + self._logger.debug("{0} : {1}".format(self._name, r.content)) From 21d2f35e6ed2a0a3b19ba3b4fe883bc4c3c4037e Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Thu, 4 May 2023 00:47:06 +0200 Subject: [PATCH 02/19] add password parameter and fix post to delete 75% --- import_export_canalblog.py | 12 ++++++++---- lib/WPImport.py | 32 ++++++++++++++++++-------------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/import_export_canalblog.py b/import_export_canalblog.py index 8250a9e..b44ab9c 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -96,6 +96,7 @@ if __name__ == '__main__': import_parser = subparsers.add_parser("import") import_parser.add_argument("--user", help="wordpress user", required=True) + import_parser.add_argument("--password", help="password wordpress's user", default="") import_parser.add_argument("--file", help="HTML file", default="") import_parser.add_argument("--directory", help="HTML directory", default="") import_parser.add_argument("--canalblog", help="URL Canalblog", default="") @@ -103,6 +104,7 @@ if __name__ == '__main__': import_parser.add_argument("--serial", help="Serial execution", action="store_true") + export_parser = subparsers.add_parser("export") export_parser.add_argument("--url", help="canblog URL to be scraping", required=True) @@ -143,10 +145,12 @@ if __name__ == '__main__': logger.addHandler(fileHandler) if args.command == "import": - password = getpass() - if len(password) == 0: - logger.error("No password error !!! ") - exit(1) + password = args.password + if len(args.password) == 0: + password = getpass() + if len(password) == 0: + logger.error("No password error !!! ") + exit(1) basic = HTTPBasicAuth(args.user, password) wordpress = args.wordpress.split(",") diff --git a/lib/WPImport.py b/lib/WPImport.py index 50b58b6..6f82ec4 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -228,6 +228,8 @@ class WPimport: data = page.content img_type = "image/{0}".format(img_type_file) + if img_type_file == "jpg": + img_type = "image/jpeg" headers={ 'Content-Type': img_type,'Content-Disposition' : 'attachment; filename={0}'.format(img_name)} try: r = self._request.post("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, headers=headers, data=data) @@ -509,21 +511,23 @@ class WPimport: result = page.json() if len(result) > 0: for i in result: - self._logger.info("{0} : Page {1} already exist and going to delete".format(self._name, title)) - post_id = i["id"] - try: - params = {"force":1} - page = self._request.delete("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, params=params) - except Exception as err: - self._logger.error("{0} : Connection error for delete post : {1}".format(self._name, err)) - exit(1) - if page.status_code == 200: - result = page.json() - self._logger.info("{0} : Post deleted : {1}".format(self._name, title)) + self._logger.debug("{0} : Data for post to delete : {1}".format(self._name, i)) + if i["title"]["rendered"] == title: + self._logger.info("{0} : Page {1} already exist and going to delete".format(self._name, title)) + post_id = i["id"] + try: + params = {"force":1} + page = self._request.delete("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, params=params) + except Exception as err: + self._logger.error("{0} : Connection error for delete post : {1}".format(self._name, err)) + exit(1) + if page.status_code == 200: + result = page.json() + self._logger.info("{0} : Post deleted : {1}".format(self._name, title)) - else: - self._logger.error("{0} : Post not deleted due status code : {1}".format(self._name, page.status_code)) - self._logger.debug("{0} : {1}".format(self._name, page.content)) + else: + self._logger.error("{0} : Post not deleted due status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) else: self._logger.error("{0} : Connection for delete post error with status code : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(self._name, page.content)) From f9be6770e3581ddb44db8cf411749a57a6560b9c Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Sun, 7 May 2023 09:26:48 +0200 Subject: [PATCH 03/19] separate article and page --- .gitignore | 2 +- import_export_canalblog.py | 22 ++++++++++++++-------- lib/WPExport.py | 20 +++++++++++--------- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index a73c961..f82fde7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ backup*/ wp-navigation -web_scrap.log +*.log __pycache__/ diff --git a/import_export_canalblog.py b/import_export_canalblog.py index b44ab9c..c931a5f 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -3,23 +3,27 @@ from requests.auth import HTTPBasicAuth from getpass import getpass from urllib.parse import urlparse from concurrent import futures +from concurrent.futures import as_completed, wait import argparse, logging, threading from lib.WPImport import WPimport from lib.WPExport import WPExport + def download(name_thread, max_thread, url, logger, parser, directory, html, img): - - exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory) - + exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory) webpage = exportWp.getUrlPage(name_thread, max_thread) if html is False: - exportWp.downloadHTML(webpage) - - if args.img is False: - exportWp.downloadImg(webpage) - del exportWp + exportWp.downloadHTML(webpage["article"]) + exportWp.downloadHTML(webpage["page"]) + + if img is False: + exportWp.downloadImg(webpage["article"]) + exportWp.downloadImg(webpage["page"]) + + + def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial): canalblog = canalblog.split(",") @@ -198,6 +202,8 @@ if __name__ == '__main__': if args.css is False: exportWp.downloadCss() del exportWp + + if args.html is False or args.img is False: try: diff --git a/lib/WPExport.py b/lib/WPExport.py index 3bc4b6d..b817837 100644 --- a/lib/WPExport.py +++ b/lib/WPExport.py @@ -13,6 +13,7 @@ class WPExport: self._dir = directory self._name = name + self._request = requests.Session() retries = Retry(total=10, @@ -27,6 +28,7 @@ class WPExport: # Public method + # Set name def setName(self, name): @@ -83,8 +85,8 @@ class WPExport: self._logger.error("{0} : URL did not get due status code : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(self._name, page.content)) + webpage = {"page":[], "article":[]} - webpage = [] for i in page_url: try: page = self._request.get(i) @@ -93,8 +95,8 @@ class WPExport: exit(1) if page.status_code == 200: self._logger.info("{0} : page : {1}".format(self._name, i)) - if i not in webpage: - webpage.append(i) + if i not in webpage["page"]: + webpage["page"].append(i) soup = BeautifulSoup(page.text, self._parser) class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline") if len(class_div) > 0: @@ -120,27 +122,27 @@ class WPExport: if len(categorie) > 2: url_paging = "{0}/archives/{1}/p{2}-10.html".format(self._url, categorie[2], paging) self._logger.info("{0} : {1}".format(self._name, url_paging)) - if url_paging not in webpage: - webpage.append(url_paging) + if url_paging not in webpage["page"]: + webpage["page"].append(url_paging) page = self._request.get(url_paging) if page.status_code == 200: soup = BeautifulSoup(page.text, self._parser) h2 = soup.find_all("h2") for title in h2: href = title.find_all("a")[0].get("href", "/") - if href not in webpage: + if href not in webpage["article"]: try: o = urlparse(href) o = o._replace(scheme="https").geturl() except Exception as err: self._logger.error("parsing error : {0}".format(err)) exit(1) - webpage.append(o) + webpage["article"].append(o) else: self._logger.error("{0} : web didn't get due status code : {1}".format(self._name, page.status_code)) - self._logger.debug(page.content) - + self._logger.debug("{0} : {1}".format(self._name, page.content)) return webpage + # Private method From 3c2f1cc0170f1e33b04744efa78b1c5f7768e17b Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Sun, 7 May 2023 17:38:44 +0200 Subject: [PATCH 04/19] separate publication and principal --- import_export_canalblog.py | 23 +++++++++++++++-------- lib/WPExport.py | 21 +++++++++++++-------- lib/WPImport.py | 30 ++++++++++++++---------------- 3 files changed, 42 insertions(+), 32 deletions(-) diff --git a/import_export_canalblog.py b/import_export_canalblog.py index c931a5f..c6cc592 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -14,13 +14,13 @@ from lib.WPExport import WPExport def download(name_thread, max_thread, url, logger, parser, directory, html, img): exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory) webpage = exportWp.getUrlPage(name_thread, max_thread) - if html is False: - exportWp.downloadHTML(webpage["article"]) - exportWp.downloadHTML(webpage["page"]) + for i in ["article", "page"]: + for j in ["publications", "principal"]: + if html is False: + exportWp.downloadHTML(webpage[j][i]) - if img is False: - exportWp.downloadImg(webpage["article"]) - exportWp.downloadImg(webpage["page"]) + if img is False: + exportWp.downloadImg(webpage[j][i]) @@ -44,7 +44,10 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas del exportWp for j in wordpress: importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser) - importWp.fromUrl(webpage) + for k in ["article", "page"]: + for l in ["publications", "principal"]: + importWp.fromUrl(webpage[l][k]) + del importWp else: if len(canalblog) != len(wordpress): @@ -62,7 +65,11 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas webpage = exportWp.getUrlPage(name_thread, max_thread) del exportWp importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser) - importWp.fromUrl(webpage) + + for k in ["article", "page"]: + for l in ["publications", "principal"]: + importWp.fromUrl(webpage[l][k]) + del importWp diff --git a/lib/WPExport.py b/lib/WPExport.py index b817837..d6b2ee9 100644 --- a/lib/WPExport.py +++ b/lib/WPExport.py @@ -85,9 +85,14 @@ class WPExport: self._logger.error("{0} : URL did not get due status code : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(self._name, page.content)) - webpage = {"page":[], "article":[]} - + webpage = {"principal": {"page":[], "article":[]}, "publications": {"page":[], "article":[]}} for i in page_url: + section = "publications" + o = urlparse(i) + o = o._replace(scheme="https") + i = o.geturl().replace(":///", "://") + if i == "{0}/".format(self._url): + section = "principal" try: page = self._request.get(i) except Exception as err: @@ -95,8 +100,8 @@ class WPExport: exit(1) if page.status_code == 200: self._logger.info("{0} : page : {1}".format(self._name, i)) - if i not in webpage["page"]: - webpage["page"].append(i) + if i not in webpage[section]["page"]: + webpage[section]["page"].append(i) soup = BeautifulSoup(page.text, self._parser) class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline") if len(class_div) > 0: @@ -122,22 +127,22 @@ class WPExport: if len(categorie) > 2: url_paging = "{0}/archives/{1}/p{2}-10.html".format(self._url, categorie[2], paging) self._logger.info("{0} : {1}".format(self._name, url_paging)) - if url_paging not in webpage["page"]: - webpage["page"].append(url_paging) + if url_paging not in webpage[section]["page"]: + webpage[section]["page"].append(url_paging) page = self._request.get(url_paging) if page.status_code == 200: soup = BeautifulSoup(page.text, self._parser) h2 = soup.find_all("h2") for title in h2: href = title.find_all("a")[0].get("href", "/") - if href not in webpage["article"]: + if href not in webpage[section]["article"]: try: o = urlparse(href) o = o._replace(scheme="https").geturl() except Exception as err: self._logger.error("parsing error : {0}".format(err)) exit(1) - webpage["article"].append(o) + webpage[section]["article"].append(o) else: self._logger.error("{0} : web didn't get due status code : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(self._name, page.content)) diff --git a/lib/WPImport.py b/lib/WPImport.py index 6f82ec4..3badd98 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -512,22 +512,20 @@ class WPimport: if len(result) > 0: for i in result: self._logger.debug("{0} : Data for post to delete : {1}".format(self._name, i)) - if i["title"]["rendered"] == title: - self._logger.info("{0} : Page {1} already exist and going to delete".format(self._name, title)) - post_id = i["id"] - try: - params = {"force":1} - page = self._request.delete("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, params=params) - except Exception as err: - self._logger.error("{0} : Connection error for delete post : {1}".format(self._name, err)) - exit(1) - if page.status_code == 200: - result = page.json() - self._logger.info("{0} : Post deleted : {1}".format(self._name, title)) - - else: - self._logger.error("{0} : Post not deleted due status code : {1}".format(self._name, page.status_code)) - self._logger.debug("{0} : {1}".format(self._name, page.content)) + self._logger.info("{0} : Page {1} already exist and going to delete".format(self._name, title)) + post_id = i["id"] + try: + params = {"force":1} + page = self._request.delete("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, params=params) + except Exception as err: + self._logger.error("{0} : Connection error for delete post : {1}".format(self._name, err)) + exit(1) + if page.status_code == 200: + result = page.json() + self._logger.info("{0} : Post deleted : {1}".format(self._name, title)) + else: + self._logger.error("{0} : Post not deleted due status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) else: self._logger.error("{0} : Connection for delete post error with status code : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(self._name, page.content)) From 3d7aa19441dd0bc33aaa201db3d67701167e0d9d Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Fri, 12 May 2023 00:16:58 +0200 Subject: [PATCH 05/19] add update --- lib/WPImport.py | 55 +++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/lib/WPImport.py b/lib/WPImport.py index 3badd98..adfa67f 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -5,6 +5,7 @@ from urllib.parse import urlparse import requests, os, logging, re, json from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry +from slugify import slugify class WPimport: # Constructor @@ -499,48 +500,52 @@ class WPimport: self._logger.error("{0} : Connection error with status code for get author : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(page.content)) - - params = {"search":title} + slug = slugify(title) + params = {"slug":slug} try: page = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params) except Exception as err: self._logger.error("{0} : Connection error for search post : {1}".format(self._name, err)) exit(1) headers = {'Content-Type': 'application/json', 'Accept':'application/json'} + page_is_exist = True if page.status_code == 200: result = page.json() - if len(result) > 0: + if len(result) == 0: + page_is_exist = False + else: for i in result: - self._logger.debug("{0} : Data for post to delete : {1}".format(self._name, i)) - self._logger.info("{0} : Page {1} already exist and going to delete".format(self._name, title)) + self._logger.debug("{0} : Data for post to update : {1}".format(self._name, i)) + self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, title)) post_id = i["id"] try: - params = {"force":1} - page = self._request.delete("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, params=params) + page = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, data=json.dumps(data)) except Exception as err: - self._logger.error("{0} : Connection error for delete post : {1}".format(self._name, err)) + self._logger.error("{0} : Connection error for update post : {1}".format(self._name, err)) exit(1) if page.status_code == 200: result = page.json() - self._logger.info("{0} : Post deleted : {1}".format(self._name, title)) + self._logger.info("{0} : Post updated : {1}".format(self._name, title)) + self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) + self._linkImgPost(result["title"]["raw"], list_img, result["id"]) else: - self._logger.error("{0} : Post not deleted due status code : {1}".format(self._name, page.status_code)) + self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(self._name, page.content)) else: - self._logger.error("{0} : Connection for delete post error with status code : {1}".format(self._name, page.status_code)) + self._logger.error("{0} : Connection for update post error with status code : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(self._name, page.content)) - - try: - page = self._request.post("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, headers=headers, data=json.dumps(data)) - except Exception as err: - self._logger.error("{0} : Connection error for create post : {1}".format(self._name, err)) - exit(1) - if page.status_code == 201: - result = page.json() - self._logger.info("{0} : Post added : {1}".format(self._name, result["title"]["raw"])) - self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) - self._linkImgPost(result["title"]["raw"], list_img, result["id"]) - else: - self._logger.error("{0} : Post not added due status code : {1}".format(self._name, r.status_code)) - self._logger.debug("{0} : {1}".format(self._name, r.content)) + if page_is_exist == False: + try: + page = self._request.post("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, headers=headers, data=json.dumps(data)) + except Exception as err: + self._logger.error("{0} : Connection error for create post : {1}".format(self._name, err)) + exit(1) + if page.status_code == 201: + result = page.json() + self._logger.info("{0} : Post added : {1}".format(self._name, result["title"]["raw"])) + self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) + self._linkImgPost(result["title"]["raw"], list_img, result["id"]) + else: + self._logger.error("{0} : Post not added due status code : {1}".format(self._name, r.status_code)) + self._logger.debug("{0} : {1}".format(self._name, r.content)) From ece4d78dd8fd77cc708c76a18c3042716d974dba Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Sun, 14 May 2023 18:35:36 +0200 Subject: [PATCH 06/19] add remove all --- import_export_canalblog.py | 11 +++++- lib/WPImport.py | 73 ++++++++++++++++++++++++++------------ 2 files changed, 61 insertions(+), 23 deletions(-) diff --git a/import_export_canalblog.py b/import_export_canalblog.py index c6cc592..e6b4e28 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -113,6 +113,7 @@ if __name__ == '__main__': import_parser.add_argument("--canalblog", help="URL Canalblog", default="") import_parser.add_argument("--wordpress", help="URL Wordpress", required=True) import_parser.add_argument("--serial", help="Serial execution", action="store_true") + import_parser.add_argument("--remove", help="Remove all articles", action="store_true") @@ -165,12 +166,16 @@ if __name__ == '__main__': basic = HTTPBasicAuth(args.user, password) wordpress = args.wordpress.split(",") - importWp = WPimport(basic, "", logger, args.parser) + importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser) if len(args.file) > 0: for i in wordpress: importWp.setUrl(i) importWp.fromFile(files=args.file.split(",")) if len(args.directory) > 0: + if args.remove: + for i in args.wordpress.split(","): + importWp.setUrl(i) + importWp.removeAll() try: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: wait_for = [ @@ -180,6 +185,10 @@ if __name__ == '__main__': except Exception as err: logger.error("Threading error : {0}".format(err)) if len(args.canalblog) > 0: + if args.remove: + for i in args.wordpress.split(","): + importWp.setUrl(i) + importWp.removeAll() try: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: wait_for = [ diff --git a/lib/WPImport.py b/lib/WPImport.py index adfa67f..f8d8a35 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -5,7 +5,6 @@ from urllib.parse import urlparse import requests, os, logging, re, json from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry -from slugify import slugify class WPimport: # Constructor @@ -87,6 +86,35 @@ class WPimport: else: self._addOrUpdateFeaturedMedia(soup) + def removeAll(self): + params = {"per_page":100} + try: + self._logger.info("{0} : List posts to remove for url : {1}".format(self._name, self._wordpress)) + + r = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params, headers=self._headers_json) + except Exception as err: + self._logger.error("{0} : Connection error for list post to remove : {1}".format(self._name, err)) + if r.status_code == 200: + result = r.json() + if len(result) > 0: + for i in result: + self._logger.info("{0} : Remove article for url {1} : {2}".format(self._name, self._wordpress, i["title"]["rendered"])) + params = {"force":1} + try: + r = self._request.delete("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, i["id"]), auth=self._basic, headers=self._headers_json , params=params) + if r.status_code == 200: + self._logger.info("{0} : Post removed for URL {1} : {2}".format(self._name, self._wordpress, i["title"]["rendered"])) + else: + self._logger.error("{0} : Connection error for post {1} with status code {2}".format(self._name, self._wordpress, i["title"]["rendered"])) + except Exception as err: + self._logger.error("{0} : Connection error for post remove : {1}".format(self._name, err)) + exit(1) + self.removeAll() + else: + self._logger.error("{0} : Error for list to remove due status code {1}".format(self._name, r.status_code)) + self._logger.debug("{0} : Content error : {1}".format(self._name, r.content)) + + # Private method ## Get all files @@ -489,6 +517,7 @@ class WPimport: data = {"title":title, "content":bodyhtml, "status":"publish", "date": "{0}-{1}-{2}T{3}:00".format(time[2],month[time[1]],time[0], hour), "tags": listelement["tags"], "categories": listelement["categories"]} params = {"search":author} try: + self._logger.info("{0} : Get author : {1}".format(self._name, author)) page = self._request.get("http://{0}/wp-json/wp/v2/users".format(self._wordpress), auth=self._basic, params=params) except Exception as err: self._logger.error("{0} : Connection error for get author : {1}".format(self._name, err)) @@ -499,15 +528,13 @@ class WPimport: else: self._logger.error("{0} : Connection error with status code for get author : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(page.content)) - - slug = slugify(title) - params = {"slug":slug} + + params = {"search": title} try: page = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params) except Exception as err: self._logger.error("{0} : Connection error for search post : {1}".format(self._name, err)) exit(1) - headers = {'Content-Type': 'application/json', 'Accept':'application/json'} page_is_exist = True if page.status_code == 200: result = page.json() @@ -515,29 +542,31 @@ class WPimport: page_is_exist = False else: for i in result: - self._logger.debug("{0} : Data for post to update : {1}".format(self._name, i)) - self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, title)) - post_id = i["id"] - try: - page = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, data=json.dumps(data)) - except Exception as err: - self._logger.error("{0} : Connection error for update post : {1}".format(self._name, err)) - exit(1) - if page.status_code == 200: - result = page.json() - self._logger.info("{0} : Post updated : {1}".format(self._name, title)) - self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) - self._linkImgPost(result["title"]["raw"], list_img, result["id"]) - else: - self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code)) - self._logger.debug("{0} : {1}".format(self._name, page.content)) + if i["title"]["rendered"] == title: + post_id = i["id"] + self._logger.debug("{0} : Data for post to update : {1}".format(self._name, result[0])) + self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, title)) + + try: + page = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=self._headers_json, data=json.dumps(data)) + except Exception as err: + self._logger.error("{0} : Connection error for update post : {1}".format(self._name, err)) + exit(1) + if page.status_code == 200: + result = page.json() + self._logger.info("{0} : Post updated : {1}".format(self._name, title)) + self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) + self._linkImgPost(result["title"]["raw"], list_img, result["id"]) + else: + self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) else: self._logger.error("{0} : Connection for update post error with status code : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(self._name, page.content)) if page_is_exist == False: try: - page = self._request.post("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, headers=headers, data=json.dumps(data)) + page = self._request.post("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, headers=self._headers_json, data=json.dumps(data)) except Exception as err: self._logger.error("{0} : Connection error for create post : {1}".format(self._name, err)) exit(1) From ee8674fd595268deda4453c470156c80bef3b135 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Mon, 15 May 2023 23:13:55 +0200 Subject: [PATCH 07/19] add remove class --- lib/WPImport.py | 28 ------------------ lib/WPRemove.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 28 deletions(-) create mode 100644 lib/WPRemove.py diff --git a/lib/WPImport.py b/lib/WPImport.py index f8d8a35..6108582 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -86,34 +86,6 @@ class WPimport: else: self._addOrUpdateFeaturedMedia(soup) - def removeAll(self): - params = {"per_page":100} - try: - self._logger.info("{0} : List posts to remove for url : {1}".format(self._name, self._wordpress)) - - r = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params, headers=self._headers_json) - except Exception as err: - self._logger.error("{0} : Connection error for list post to remove : {1}".format(self._name, err)) - if r.status_code == 200: - result = r.json() - if len(result) > 0: - for i in result: - self._logger.info("{0} : Remove article for url {1} : {2}".format(self._name, self._wordpress, i["title"]["rendered"])) - params = {"force":1} - try: - r = self._request.delete("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, i["id"]), auth=self._basic, headers=self._headers_json , params=params) - if r.status_code == 200: - self._logger.info("{0} : Post removed for URL {1} : {2}".format(self._name, self._wordpress, i["title"]["rendered"])) - else: - self._logger.error("{0} : Connection error for post {1} with status code {2}".format(self._name, self._wordpress, i["title"]["rendered"])) - except Exception as err: - self._logger.error("{0} : Connection error for post remove : {1}".format(self._name, err)) - exit(1) - self.removeAll() - else: - self._logger.error("{0} : Error for list to remove due status code {1}".format(self._name, r.status_code)) - self._logger.debug("{0} : Content error : {1}".format(self._name, r.content)) - # Private method diff --git a/lib/WPRemove.py b/lib/WPRemove.py new file mode 100644 index 0000000..ad4ab8a --- /dev/null +++ b/lib/WPRemove.py @@ -0,0 +1,75 @@ +#!/usr/bin/python3 + +from bs4 import BeautifulSoup +from urllib.parse import urlparse +import requests, os, logging, re, json +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry + +class WPRemove: + # Constructor + def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None): + self._name = name + self._basic = basic + self._wordpress = wordpress + self._logger = logger + self._headers_json = {'Content-Type': 'application/json', 'Accept':'application/json'} + + self._request = requests.Session() + + retries = Retry(connect=10, read=10, redirect=5, + status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2) + + self._request.mount('http://', HTTPAdapter(max_retries=retries)) + + + # Destructor + def __del__(self): + print("{0} : Import finished for {1}".format(self._name, self._wordpress)) + + # Public method + + def setUrl(self, wordpress): + self._wordpress = wordpress + + def cleanPosts(self): + self._removeAll("posts") + + def cleanTags(self): + self._removeAll("tags") + + def cleanCategories(self): + self._removeAll("categories") + + def cleanMedia(self): + self._removeAll("media") + + # Private method + + def _removeAll(self, composant): + params = {"per_page":100} + try: + self._logger.info("{0} : List {2} to remove for url : {1}".format(self._name, self._wordpress, composant)) + + r = self._request.get("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, composant), auth=self._basic, params=params, headers=self._headers_json) + except Exception as err: + self._logger.error("{0} : Connection error for list {1} to remove : {2}".format(self._name, composant, err)) + if r.status_code == 200: + result = r.json() + if len(result) > 0: + for i in result: + self._logger.info("{0} : Remove {2} for url {1} : {3}".format(self._name, self._wordpress, composant, i["title"]["rendered"])) + params = {"force":1} + try: + r = self._request.delete("http://{0}/wp-json/wp/v2/{1}/{2}".format(self._wordpress, composant, i["id"]), auth=self._basic, headers=self._headers_json , params=params) + if r.status_code == 200: + self._logger.info("{0} : Post removed for URL {1} {2} : {3}".format(self._name, self._wordpress, composant, i["title"]["rendered"])) + else: + self._logger.error("{0} : Connection error for post {1} {2} {3} with status code {4}".format(self._name, self._wordpress, composant, i["title"]["rendered"], r.status_code)) + except Exception as err: + self._logger.error("{0} : Connection error for {1} remove : {2}".format(self._name, composant, err)) + exit(1) + self._removeAll(composant) + else: + self._logger.error("{0} : Error for list to remove {1} due status code {2}".format(self._name, composant, r.status_code)) + self._logger.debug("{0} : Content error for {1} : {2}".format(self._name, composant, r.content)) From cfb24bed0e16797c2209f62d753435bce2ceb869 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Mon, 15 May 2023 23:21:25 +0200 Subject: [PATCH 08/19] add remove parameters --- import_export_canalblog.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/import_export_canalblog.py b/import_export_canalblog.py index e6b4e28..4c38ff3 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -113,7 +113,22 @@ if __name__ == '__main__': import_parser.add_argument("--canalblog", help="URL Canalblog", default="") import_parser.add_argument("--wordpress", help="URL Wordpress", required=True) import_parser.add_argument("--serial", help="Serial execution", action="store_true") - import_parser.add_argument("--remove", help="Remove all articles", action="store_true") + import_parser.add_argument("--remove", help="Remove all", action="store_true") + remove_parser.add_argument("--remove-posts", help="Remove all posts", dest="posts", action="store_true") + remove_parser.add_argument("--remove-categories", help="Remove all categories", dest="categories", action="store_true") + remove_parser.add_argument("--remove-tags", help="Remove all tags", dest="tags", action="store_true") + remove_parser.add_argument("--remove-media", help="Remove all media", dest="media", action="store_true") + + + remove_parser = subparsers.add_parser("remove") + remove_parser.add_argument("--user", help="wordpress user", required=True) + remove_parser.add_argument("--password", help="password wordpress's user", default="") + remove_parser.add_argument("--wordpress", help="URL Wordpress", required=True) + remove_parser.add_argument("--all", help="Remove all (posts, media, tags, categories)", action="store_true") + remove_parser.add_argument("--posts", help="Remove all posts", action="store_true") + remove_parser.add_argument("--categories", help="Remove all categories", action="store_true") + remove_parser.add_argument("--tags", help="Remove all tags", action="store_true") + remove_parser.add_argument("--media", help="Remove all media", action="store_true") From f3cb5c4069c0d1855759b4766aa622873aa8b4ba Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Mon, 15 May 2023 23:22:41 +0200 Subject: [PATCH 09/19] fix parameters --- import_export_canalblog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/import_export_canalblog.py b/import_export_canalblog.py index 4c38ff3..81b3e00 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -113,7 +113,7 @@ if __name__ == '__main__': import_parser.add_argument("--canalblog", help="URL Canalblog", default="") import_parser.add_argument("--wordpress", help="URL Wordpress", required=True) import_parser.add_argument("--serial", help="Serial execution", action="store_true") - import_parser.add_argument("--remove", help="Remove all", action="store_true") + import_parser.add_argument("--remove-all", dest="remove", help="Remove all", action="store_true") remove_parser.add_argument("--remove-posts", help="Remove all posts", dest="posts", action="store_true") remove_parser.add_argument("--remove-categories", help="Remove all categories", dest="categories", action="store_true") remove_parser.add_argument("--remove-tags", help="Remove all tags", dest="tags", action="store_true") @@ -124,7 +124,7 @@ if __name__ == '__main__': remove_parser.add_argument("--user", help="wordpress user", required=True) remove_parser.add_argument("--password", help="password wordpress's user", default="") remove_parser.add_argument("--wordpress", help="URL Wordpress", required=True) - remove_parser.add_argument("--all", help="Remove all (posts, media, tags, categories)", action="store_true") + remove_parser.add_argument("--all", dest="remove", help="Remove all (posts, media, tags, categories)", action="store_true") remove_parser.add_argument("--posts", help="Remove all posts", action="store_true") remove_parser.add_argument("--categories", help="Remove all categories", action="store_true") remove_parser.add_argument("--tags", help="Remove all tags", action="store_true") From 8bdaea391045f1352d370d91945e32c69630792b Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Mon, 15 May 2023 23:42:18 +0200 Subject: [PATCH 10/19] add remove command --- import_export_canalblog.py | 44 ++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/import_export_canalblog.py b/import_export_canalblog.py index 81b3e00..97a5bf3 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -8,7 +8,29 @@ from concurrent.futures import as_completed, wait import argparse, logging, threading from lib.WPImport import WPimport from lib.WPExport import WPExport +from lib.WPRemove import WPRemove +def remove(args, basic, logger): + removeWp = WPRemove(basic=basic, wordpress="", logger=logger) + if args.remove: + for i in args.wordpress.split(","): + removeWp.setUrl(i) + removeWp.cleanPosts() + removeWp.cleanTags() + removeWp.cleanCategories() + removeWp.cleanMedia() + else: + for i in args.wordpress.split(","): + removeWp.setUrl(i) + if args.posts: + removeWp.cleanPosts() + if args.categories: + removeWp.cleanCategories() + if args.tags: + removeWp.cleanTags() + if args.media: + removeWp.cleanMedia() + del removeWp def download(name_thread, max_thread, url, logger, parser, directory, html, img): @@ -187,10 +209,7 @@ if __name__ == '__main__': importWp.setUrl(i) importWp.fromFile(files=args.file.split(",")) if len(args.directory) > 0: - if args.remove: - for i in args.wordpress.split(","): - importWp.setUrl(i) - importWp.removeAll() + remove(args, basic, logger) try: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: wait_for = [ @@ -200,10 +219,7 @@ if __name__ == '__main__': except Exception as err: logger.error("Threading error : {0}".format(err)) if len(args.canalblog) > 0: - if args.remove: - for i in args.wordpress.split(","): - importWp.setUrl(i) - importWp.removeAll() + remove(args, basic, logger) try: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: wait_for = [ @@ -246,5 +262,15 @@ if __name__ == '__main__': except Exception as err: logger.error("Threading error : {0}".format(err)) - + + if args.command == "remove": + password = args.password + if len(args.password) == 0: + password = getpass() + if len(password) == 0: + logger.error("No password error !!! ") + exit(1) + + basic = HTTPBasicAuth(args.user, password) + remove(args, basic, logger) exit(0) \ No newline at end of file From d18f4e1579f725f607c5c81ba536522a7ae2c283 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Mon, 15 May 2023 23:51:45 +0200 Subject: [PATCH 11/19] Add clean --- import_export_canalblog.py | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/import_export_canalblog.py b/import_export_canalblog.py index 97a5bf3..6a2a3e3 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -12,7 +12,7 @@ from lib.WPRemove import WPRemove def remove(args, basic, logger): removeWp = WPRemove(basic=basic, wordpress="", logger=logger) - if args.remove: + if args.remove == True: for i in args.wordpress.split(","): removeWp.setUrl(i) removeWp.cleanPosts() @@ -22,13 +22,13 @@ def remove(args, basic, logger): else: for i in args.wordpress.split(","): removeWp.setUrl(i) - if args.posts: + if args.posts == True: removeWp.cleanPosts() - if args.categories: + if args.categories == True: removeWp.cleanCategories() - if args.tags: + if args.tags == True: removeWp.cleanTags() - if args.media: + if args.media == True: removeWp.cleanMedia() del removeWp @@ -136,10 +136,10 @@ if __name__ == '__main__': import_parser.add_argument("--wordpress", help="URL Wordpress", required=True) import_parser.add_argument("--serial", help="Serial execution", action="store_true") import_parser.add_argument("--remove-all", dest="remove", help="Remove all", action="store_true") - remove_parser.add_argument("--remove-posts", help="Remove all posts", dest="posts", action="store_true") - remove_parser.add_argument("--remove-categories", help="Remove all categories", dest="categories", action="store_true") - remove_parser.add_argument("--remove-tags", help="Remove all tags", dest="tags", action="store_true") - remove_parser.add_argument("--remove-media", help="Remove all media", dest="media", action="store_true") + import_parser.add_argument("--remove-posts", help="Remove all posts", dest="posts", action="store_true") + import_parser.add_argument("--remove-categories", help="Remove all categories", dest="categories", action="store_true") + import_parser.add_argument("--remove-tags", help="Remove all tags", dest="tags", action="store_true") + import_parser.add_argument("--remove-media", help="Remove all media", dest="media", action="store_true") remove_parser = subparsers.add_parser("remove") @@ -193,7 +193,7 @@ if __name__ == '__main__': fileHandler.setFormatter(formatter) logger.addHandler(fileHandler) - if args.command == "import": + if args.command == "import" or args.command == "remove": password = args.password if len(args.password) == 0: password = getpass() @@ -202,6 +202,7 @@ if __name__ == '__main__': exit(1) basic = HTTPBasicAuth(args.user, password) + if args.command == "import": wordpress = args.wordpress.split(",") importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser) if len(args.file) > 0: @@ -261,16 +262,9 @@ if __name__ == '__main__': ] except Exception as err: logger.error("Threading error : {0}".format(err)) + exit(0) - if args.command == "remove": - password = args.password - if len(args.password) == 0: - password = getpass() - if len(password) == 0: - logger.error("No password error !!! ") - exit(1) - - basic = HTTPBasicAuth(args.user, password) - remove(args, basic, logger) + if args.command == "remove": + remove(args, basic, logger) exit(0) \ No newline at end of file From ba42d56be1c6248d19a6b4fb118512fc2d044303 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Tue, 16 May 2023 00:15:16 +0200 Subject: [PATCH 12/19] fix webpage --- lib/WPExport.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/WPExport.py b/lib/WPExport.py index d6b2ee9..58827da 100644 --- a/lib/WPExport.py +++ b/lib/WPExport.py @@ -108,12 +108,14 @@ class WPExport: pagingfirstline = class_div[0].find_all("a") if len(pagingfirstline) > 1: lastpage = pagingfirstline[len(pagingfirstline)-1].get("href", "/") + self._logger.debug("{0} : Last page {1}".format(self._name, lastpage)) + element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1] number_page = element_lastpage.split("-")[0].split("p")[1] number_lastpage = int(number_page) / 10 setPageDivided = int(number_lastpage) / max_thread - setPagePart = setPageDivided * (index_thread + 1) + setPagePart = setPageDivided * (index_thread + 1) + 1 firstPagePart = (setPagePart - setPageDivided) self._logger.debug("{0} : Total page : {1}".format(self._name,int(number_lastpage))) From 769b7f43fc7f29e7734cd9c44df9d2b7f92aaba4 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Thu, 18 May 2023 00:24:41 +0200 Subject: [PATCH 13/19] fix add or update post --- lib/WPExport.py | 10 +++++--- lib/WPImport.py | 61 ++++++++++++++++++++++++++++--------------------- 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/lib/WPExport.py b/lib/WPExport.py index 58827da..90080b7 100644 --- a/lib/WPExport.py +++ b/lib/WPExport.py @@ -103,7 +103,7 @@ class WPExport: if i not in webpage[section]["page"]: webpage[section]["page"].append(i) soup = BeautifulSoup(page.text, self._parser) - class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline") + class_div = soup.find_all("div", class_="pagingfirstline") if len(class_div) > 0: pagingfirstline = class_div[0].find_all("a") if len(pagingfirstline) > 1: @@ -115,14 +115,16 @@ class WPExport: number_lastpage = int(number_page) / 10 setPageDivided = int(number_lastpage) / max_thread - setPagePart = setPageDivided * (index_thread + 1) + 1 + if setPageDivided > int(setPageDivided): + setPageDivided = setPageDivided + 1 + setPagePart = setPageDivided * (index_thread + 1) firstPagePart = (setPagePart - setPageDivided) self._logger.debug("{0} : Total page : {1}".format(self._name,int(number_lastpage))) self._logger.debug("{0} : First range : {1}".format(self._name, int(firstPagePart))) self._logger.debug("{0} : Last range : {1}".format(self._name, int(setPagePart))) - for j in range(int(firstPagePart),int(setPagePart)): + for j in range(int(firstPagePart),int(setPagePart)+1): paging = j * 10 categorie = urlparse(i).path.split("/") url_paging = "{0}/archives/p{1}-10.html".format(self._url, paging) @@ -135,7 +137,9 @@ class WPExport: if page.status_code == 200: soup = BeautifulSoup(page.text, self._parser) h2 = soup.find_all("h2") + self._logger.debug("{0} : {1} H2 : {2}".format(self._name, url_paging, h2)) for title in h2: + self._logger.debug("{0} : {1} a : {2}".format(self._name, url_paging, title.find_all("a"))) href = title.find_all("a")[0].get("href", "/") if href not in webpage[section]["article"]: try: diff --git a/lib/WPImport.py b/lib/WPImport.py index 6108582..8e27a0e 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -489,12 +489,15 @@ class WPimport: data = {"title":title, "content":bodyhtml, "status":"publish", "date": "{0}-{1}-{2}T{3}:00".format(time[2],month[time[1]],time[0], hour), "tags": listelement["tags"], "categories": listelement["categories"]} params = {"search":author} try: - self._logger.info("{0} : Get author : {1}".format(self._name, author)) + self._logger.info("{0} : Search author : {1}".format(self._name, author)) page = self._request.get("http://{0}/wp-json/wp/v2/users".format(self._wordpress), auth=self._basic, params=params) - except Exception as err: + except ConnectionError as err: self._logger.error("{0} : Connection error for get author : {1}".format(self._name, err)) exit(1) + except Exception as err: + self._logger.error("{0} : Exception error for get author : {1}".format(self._name, err)) if page.status_code == 200: + self._logger.info("{0} : Get author id : {1}".format(self._name, result[0]["id"])) result = page.json() data["author"] = result[0]["id"] else: @@ -503,45 +506,51 @@ class WPimport: params = {"search": title} try: + self._logger.info("{0} : Search post : {1}".format(self._name, title)) page = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params) - except Exception as err: + except ConnectionError as err: self._logger.error("{0} : Connection error for search post : {1}".format(self._name, err)) exit(1) - page_is_exist = True + except Exception as err: + self._logger.error("{0} : Exception error for search post : {1}".format(self._name, err)) + page_is_exist = False if page.status_code == 200: result = page.json() - if len(result) == 0: - page_is_exist = False - else: - for i in result: - if i["title"]["rendered"] == title: - post_id = i["id"] - self._logger.debug("{0} : Data for post to update : {1}".format(self._name, result[0])) - self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, title)) + self._logger.info("{0} : Number result posts : {1}".format(self._name, len(result))) + for i in result: + self._logger.info("{0} : Search title posts for {2} : {1}".format(self._name, i["title"]["rendered"], title)) + if i["title"]["rendered"] == title: + page_is_exist = True + post_id = i["id"] + self._logger.debug("{0} : Data for post to update : {1}".format(self._name, result[0])) + self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, title)) - try: - page = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=self._headers_json, data=json.dumps(data)) - except Exception as err: - self._logger.error("{0} : Connection error for update post : {1}".format(self._name, err)) - exit(1) - if page.status_code == 200: - result = page.json() - self._logger.info("{0} : Post updated : {1}".format(self._name, title)) - self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) - self._linkImgPost(result["title"]["raw"], list_img, result["id"]) - else: - self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code)) - self._logger.debug("{0} : {1}".format(self._name, page.content)) + try: + page = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=self._headers_json, data=json.dumps(data)) + except Exception as err: + self._logger.error("{0} : Connection error for update post : {1}".format(self._name, err)) + exit(1) + if page.status_code == 200: + result = page.json() + self._logger.info("{0} : Post updated : {1}".format(self._name, title)) + self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) + self._linkImgPost(result["title"]["raw"], list_img, result["id"]) + else: + self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) else: self._logger.error("{0} : Connection for update post error with status code : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(self._name, page.content)) if page_is_exist == False: try: + self._logger.info("{0} : Creating posts : {1}".format(self._name, data["title"])) page = self._request.post("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, headers=self._headers_json, data=json.dumps(data)) - except Exception as err: + except ConnectionError as err: self._logger.error("{0} : Connection error for create post : {1}".format(self._name, err)) exit(1) + except Exception as err: + self._logger.error("{0} : Exception error for create post : {1}".format(self._name, err)) if page.status_code == 201: result = page.json() self._logger.info("{0} : Post added : {1}".format(self._name, result["title"]["raw"])) From 75772ba7f061809ec812a3bc2f4bcbb4d645ebc1 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Sun, 21 May 2023 21:12:00 +0200 Subject: [PATCH 14/19] remove doublon --- lib/WPImport.py | 120 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 80 insertions(+), 40 deletions(-) diff --git a/lib/WPImport.py b/lib/WPImport.py index 8e27a0e..7b78cad 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -5,6 +5,7 @@ from urllib.parse import urlparse import requests, os, logging, re, json from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry +from slugify import slugify class WPimport: # Constructor @@ -14,7 +15,7 @@ class WPimport: self._wordpress = wordpress self._logger = logger self._parser = parser - self._headers_json = {'Content-Type': 'application/json', 'Accept':'application/json'} + self._headers_json = {'Content-Type': 'application/json; charset=utf-8', 'Accept':'application/json'} self._request = requests.Session() @@ -487,7 +488,7 @@ class WPimport: hour = articledate[0].text time = dateheader[0].text.split(" ") data = {"title":title, "content":bodyhtml, "status":"publish", "date": "{0}-{1}-{2}T{3}:00".format(time[2],month[time[1]],time[0], hour), "tags": listelement["tags"], "categories": listelement["categories"]} - params = {"search":author} + params = {"search":author, "per_page":100} try: self._logger.info("{0} : Search author : {1}".format(self._name, author)) page = self._request.get("http://{0}/wp-json/wp/v2/users".format(self._wordpress), auth=self._basic, params=params) @@ -503,46 +504,85 @@ class WPimport: else: self._logger.error("{0} : Connection error with status code for get author : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(page.content)) - - params = {"search": title} - try: - self._logger.info("{0} : Search post : {1}".format(self._name, title)) - page = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params) - except ConnectionError as err: - self._logger.error("{0} : Connection error for search post : {1}".format(self._name, err)) - exit(1) - except Exception as err: - self._logger.error("{0} : Exception error for search post : {1}".format(self._name, err)) + if title[len(title)-1] == " ": + title = title[:-1] page_is_exist = False - if page.status_code == 200: - result = page.json() - self._logger.info("{0} : Number result posts : {1}".format(self._name, len(result))) - for i in result: - self._logger.info("{0} : Search title posts for {2} : {1}".format(self._name, i["title"]["rendered"], title)) - if i["title"]["rendered"] == title: - page_is_exist = True - post_id = i["id"] - self._logger.debug("{0} : Data for post to update : {1}".format(self._name, result[0])) - self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, title)) - - try: - page = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=self._headers_json, data=json.dumps(data)) - except Exception as err: - self._logger.error("{0} : Connection error for update post : {1}".format(self._name, err)) - exit(1) - if page.status_code == 200: - result = page.json() - self._logger.info("{0} : Post updated : {1}".format(self._name, title)) - self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) - self._linkImgPost(result["title"]["raw"], list_img, result["id"]) - else: - self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code)) - self._logger.debug("{0} : {1}".format(self._name, page.content)) - else: - self._logger.error("{0} : Connection for update post error with status code : {1}".format(self._name, page.status_code)) - self._logger.debug("{0} : {1}".format(self._name, page.content)) - if page_is_exist == False: + for index in range(1,10): + params = {"search": title, "per_page":100, "page": index} + try: + self._logger.info("{0} : Search post : {1}".format(self._name, title)) + page = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params, headers=self._headers_json) + except ConnectionError as err: + self._logger.error("{0} : Connection error for search post : {1}".format(self._name, err)) + exit(1) + except Exception as err: + self._logger.error("{0} : Exception error for search post : {1}".format(self._name, err)) + if page.status_code == 200: + self._logger.debug("{0} : Encoding : {1}".format(self._name, page.encoding)) + page.encoding = "utf-8" + result = page.json() + if len(result) == 0: + break + self._logger.info("{0} : Number result posts : {1}".format(self._name, len(result))) + count = 0 + for i in result: + + self._logger.info("{0} : Search title posts for |{2}| : |{1}|".format(self._name, i["title"]["rendered"], title)) + title_rendered = i["title"]["rendered"].replace('’', "'") + title_rendered = title_rendered.replace('–', '-') + title_rendered = title_rendered.replace('…', '...') + title_rendered = title_rendered.replace('« ', '"') + title_rendered = title_rendered.replace(' »', '"') + self._logger.debug("{0} : SIze of title : {1} - {2}".format(self._name, len(title), len(title_rendered))) + if title_rendered == title: + page_is_exist = True + post_id = i["id"] + count = count + 1 + if count > 1: + self._logger.info("{0} : Page {1} is double and going to delete".format(self._name, title)) + try: + params = {"force":1} + page = self._request.delete("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=self._headers_json, params=params) + except ConnectionError as err: + self._logger.error("{0} : Connection error for deleted post : {1}".format(self._name, err)) + exit(1) + except Exception as err: + self._logger.error("{0} : Exception error for deleted post : {1}".format(self._name, err)) + if page.status_code == 200: + self._logger.info("{0} : Post deleted : {1}".format(self._name, title)) + else: + self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) + + else: + self._logger.debug("{0} : Data for post to update : {1}".format(self._name, i)) + self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, title)) + + try: + page = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=self._headers_json, data=json.dumps(data)) + except ConnectionError as err: + self._logger.error("{0} : Connection error for update post : {1}".format(self._name, err)) + exit(1) + except Exception as err: + self._logger.error("{0} : Exception error for update post : {1}".format(self._name, err)) + if page.status_code == 200: + result = page.json() + self._logger.info("{0} : Post updated : {1}".format(self._name, title)) + self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) + self._linkImgPost(result["title"]["raw"], list_img, result["id"]) + else: + self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) + if page.status_code == 400: + self._logger.error("{0} : Connection for update post unauthorized : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) + break + else: + self._logger.error("{0} : Connection for update post error with status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) + + if page_is_exist is False: try: self._logger.info("{0} : Creating posts : {1}".format(self._name, data["title"])) page = self._request.post("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, headers=self._headers_json, data=json.dumps(data)) From 3718b807ba23883f852cda1776fc3eee6312beea Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Sun, 21 May 2023 21:14:36 +0200 Subject: [PATCH 15/19] more message debug --- lib/WPImport.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/WPImport.py b/lib/WPImport.py index 7b78cad..6e5ab5b 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -511,7 +511,7 @@ class WPimport: for index in range(1,10): params = {"search": title, "per_page":100, "page": index} try: - self._logger.info("{0} : Search post : {1}".format(self._name, title)) + self._logger.info("{0} : Search post withi index {2} : {1}".format(self._name, title, index)) page = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params, headers=self._headers_json) except ConnectionError as err: self._logger.error("{0} : Connection error for search post : {1}".format(self._name, err)) @@ -527,13 +527,13 @@ class WPimport: self._logger.info("{0} : Number result posts : {1}".format(self._name, len(result))) count = 0 for i in result: - self._logger.info("{0} : Search title posts for |{2}| : |{1}|".format(self._name, i["title"]["rendered"], title)) title_rendered = i["title"]["rendered"].replace('’', "'") title_rendered = title_rendered.replace('–', '-') title_rendered = title_rendered.replace('…', '...') title_rendered = title_rendered.replace('« ', '"') title_rendered = title_rendered.replace(' »', '"') + self._logger.debug("{0} : Search title posts for |{2}| : |{1}|".format(self._name, title_rendered, title)) self._logger.debug("{0} : SIze of title : {1} - {2}".format(self._name, len(title), len(title_rendered))) if title_rendered == title: page_is_exist = True From 0fc6e78a183067595387de247dffbd14b1f22010 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Tue, 23 May 2023 00:02:51 +0200 Subject: [PATCH 16/19] fix title rendered --- lib/WPImport.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/lib/WPImport.py b/lib/WPImport.py index 6e5ab5b..dbffb0f 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -528,11 +528,17 @@ class WPimport: count = 0 for i in result: self._logger.info("{0} : Search title posts for |{2}| : |{1}|".format(self._name, i["title"]["rendered"], title)) - title_rendered = i["title"]["rendered"].replace('’', "'") - title_rendered = title_rendered.replace('–', '-') - title_rendered = title_rendered.replace('…', '...') - title_rendered = title_rendered.replace('« ', '"') - title_rendered = title_rendered.replace(' »', '"') + if len(i["title"]["rendered"]) == len(title): + title_rendered = i["title"]["rendered"] + else: + title_rendered = i["title"]["rendered"].replace('’', "'") + title_rendered = title_rendered.replace('–', '-') + title_rendered = title_rendered.replace('…', '...') + title_rendered = title_rendered.replace('« ', '"') + title_rendered = title_rendered.replace(' »', '"') + title_rendered = title_rendered.replace('« ', '"') + title_rendered = title_rendered.replace(' »', '"') + title_rendered = title_rendered.replace('’', "'") self._logger.debug("{0} : Search title posts for |{2}| : |{1}|".format(self._name, title_rendered, title)) self._logger.debug("{0} : SIze of title : {1} - {2}".format(self._name, len(title), len(title_rendered))) if title_rendered == title: From d3ec7d147d0476e9d26ea5ff76830624037579c6 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Tue, 23 May 2023 11:22:37 +0200 Subject: [PATCH 17/19] loop replace --- lib/WPImport.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/lib/WPImport.py b/lib/WPImport.py index dbffb0f..bf04c98 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -506,6 +506,8 @@ class WPimport: self._logger.debug("{0} : {1}".format(page.content)) if title[len(title)-1] == " ": title = title[:-1] + if title[0] == " ": + title = title[1:] page_is_exist = False for index in range(1,10): @@ -531,14 +533,10 @@ class WPimport: if len(i["title"]["rendered"]) == len(title): title_rendered = i["title"]["rendered"] else: - title_rendered = i["title"]["rendered"].replace('’', "'") - title_rendered = title_rendered.replace('–', '-') - title_rendered = title_rendered.replace('…', '...') - title_rendered = title_rendered.replace('« ', '"') - title_rendered = title_rendered.replace(' »', '"') - title_rendered = title_rendered.replace('« ', '"') - title_rendered = title_rendered.replace(' »', '"') - title_rendered = title_rendered.replace('’', "'") + title_rendered = i["title"]["rendered"] + list_replace = {'’': "'", '–': '-', '…': '...', '« ': '"', ' »': '"', '« ': '"', ' »': '"', '’': "'", '"‘': "'"} + for old, new in list_replace.items(): + title_rendered = title_rendered.replace(old, new) self._logger.debug("{0} : Search title posts for |{2}| : |{1}|".format(self._name, title_rendered, title)) self._logger.debug("{0} : SIze of title : {1} - {2}".format(self._name, len(title), len(title_rendered))) if title_rendered == title: From f69298179ad75ebc0d0e680500d6256560066766 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Tue, 23 May 2023 13:45:59 +0200 Subject: [PATCH 18/19] reduce line code and add private method --- lib/WPImport.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/lib/WPImport.py b/lib/WPImport.py index bf04c98..b2af6d8 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -90,6 +90,23 @@ class WPimport: # Private method + ## replace caracter + + def _replaceCaracter(self, title_rendered): + list_replace = {'’': "'", '–': '-', '…': '...', '« ': '"', ' »': '"', '« ': '"', ' »': '"', '’': "'", '"‘': "'"} + for old, new in list_replace.items(): + title_rendered = title_rendered.replace(old, new) + return title_rendered + + ## remove space + + def _removeSpace(self, title): + if title[len(title)-1] == " ": + title = title[:-1] + if title[0] == " ": + title = title[1:] + return title + ## Get all files def _getFiles(self, item): @@ -504,10 +521,7 @@ class WPimport: else: self._logger.error("{0} : Connection error with status code for get author : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(page.content)) - if title[len(title)-1] == " ": - title = title[:-1] - if title[0] == " ": - title = title[1:] + title = self._removeSpace(title) page_is_exist = False for index in range(1,10): @@ -529,14 +543,10 @@ class WPimport: self._logger.info("{0} : Number result posts : {1}".format(self._name, len(result))) count = 0 for i in result: - self._logger.info("{0} : Search title posts for |{2}| : |{1}|".format(self._name, i["title"]["rendered"], title)) - if len(i["title"]["rendered"]) == len(title): - title_rendered = i["title"]["rendered"] - else: - title_rendered = i["title"]["rendered"] - list_replace = {'’': "'", '–': '-', '…': '...', '« ': '"', ' »': '"', '« ': '"', ' »': '"', '’': "'", '"‘': "'"} - for old, new in list_replace.items(): - title_rendered = title_rendered.replace(old, new) + title_rendered = i["title"]["rendered"] + self._logger.info("{0} : Search title posts for |{2}| : |{1}|".format(self._name, title_rendered, title)) + if len(title_rendered) != len(title): + title_rendered = self._replaceCaracter(title_rendered) self._logger.debug("{0} : Search title posts for |{2}| : |{1}|".format(self._name, title_rendered, title)) self._logger.debug("{0} : SIze of title : {1} - {2}".format(self._name, len(title), len(title_rendered))) if title_rendered == title: From 5c5dc707f5b7b4c8737a68d0b681ea3f122ece65 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Tue, 23 May 2023 16:46:07 +0200 Subject: [PATCH 19/19] fix headers search author --- lib/WPImport.py | 89 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 28 deletions(-) diff --git a/lib/WPImport.py b/lib/WPImport.py index b2af6d8..ba405ee 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -211,7 +211,7 @@ class WPimport: ## Add or update img def _addOrUpdateMedia(self, href_img, page): - media_authorized = ["png", "jpg", "jpeg", "svg"] + media_authorized = ["png", "jpg", "jpeg", "svg", "gif"] media = {"id":"", "rendered":""} split_fileimg = href_img.split("/") img_name = split_fileimg[len(split_fileimg)-1] @@ -455,33 +455,56 @@ class WPimport: for i in liste: for j in element[i]: element_exist = False - try: - params = {"params":j} - page = self._request.get("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, params=params) - except Exception as err: - self._logger.error("{0} : Connection error for {1} : {2}".format(self._name, i, err)) - exit(1) - if page.status_code == 200: - element_exist = True - result = page.json() - listelement[i].append(result[0]["id"]) - - else: - self._logger.error("{0} : {1} not found due status code : {2}".format(self._name, i, page.status_code)) - self._logger.debug("{0} : {1}".format(self._name, page.content)) - - - if element_exist is False: - data = {"name": j} - self._logger.debug("{0} : URL : {1} ".format("http://{1}/wp-json/wp/v2/{2}".format(self._name, self._wordpress, i))) - self._logger.debug("{0} : data : {1}".format(self._name, data)) - self._logger.debug("{0} : headers : {1}".format(self._name, self._headers_form)) + title_element = self._removeSpace(j) + for index in range(1,10): + self._logger.info("{0} : search {1} with index {2} : {3}".format(self._name, i, index, title_element)) try: - page = self._request.post("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, headers=self._headers_json, data=data) + params = {"search":title_element, "per_page":"100", "page":index} + page = self._request.get("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, params=params) + except ConnectionError as err: + self._logger.error("{0} : Connection error for {1} : {2}".format(self._name, i, err)) + exit(1) except Exception as err: + self._logger.error("{0} : Exception error for {1} : {2}".format(self._name, i, err)) + if page.status_code == 200: + result = page.json() + self._logger.debug("{0} : content {3} {2} : {1}".format(self._name, result, title_element, i)) + if len(result) > 0: + for k in result: + title_rendered = k["name"] + self._logger.debug("{0} : content {2} : {1}".format(self._name, title_rendered, i)) + self._logger.debug("{0} : size of content {3} : {2} - {1}".format(self._name, len(title_rendered), len(title_element), i)) + if len(title_element) != len(title_rendered): + title_rendered = self._replaceCaracter(title_rendered) + + if title_element == title_rendered: + self._logger.info("{0} : {1} found : {2}".format(self._name, i, title_rendered)) + element_exist = True + listelement[i].append(k["id"]) + else: + break + if page.status_code == 400: + self._logger.error("{0} : {1} not found due status code : {2}".format(self._name, i, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) + break + else: + self._logger.error("{0} : {1} not found due status code : {2}".format(self._name, i, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) + + self._logger.debug("{0} : Element {3} {2} is {1}".format(self._name, element_exist, title_element, i)) + if element_exist == False: + data = {"name": title_element} + self._logger.info("{0} : Create {1} : {2}".format(self._name, i, title_element)) + self._logger.debug("{0} : Data : {1}".format(self._name, data)) + try: + page = self._request.post("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, headers=self._headers_json, data=json.dumps(data)) + except ConnectionError as err: self._logger.error("{0} : Connection error for post {1} : {2}".format(self._name, i, err)) exit(1) + except Exception as err: + self._logger.error("{0} : Exception error for post {1} : {2}".format(self._name, i, err)) if page.status_code == 201: + self._logger.info("{0} : {1} created : {2}".format(self._name, i, j)) result = page.json() listelement[i].append(result["id"]) else: @@ -504,30 +527,40 @@ class WPimport: bodyhtml = bodyhtml.replace(i["old_src"], o.path) hour = articledate[0].text time = dateheader[0].text.split(" ") + self._logger.debug("{0} : Title post : |{1}|".format(self._name, title)) + title = self._removeSpace(title) + self._logger.debug("{0} : Rendered Title post : |{1}|".format(self._name, title)) + data = {"title":title, "content":bodyhtml, "status":"publish", "date": "{0}-{1}-{2}T{3}:00".format(time[2],month[time[1]],time[0], hour), "tags": listelement["tags"], "categories": listelement["categories"]} + self._logger.debug("{0} : Data for post : |{1}| : {2}" .format(self._name, title, data)) + params = {"search":author, "per_page":100} try: self._logger.info("{0} : Search author : {1}".format(self._name, author)) - page = self._request.get("http://{0}/wp-json/wp/v2/users".format(self._wordpress), auth=self._basic, params=params) + page = self._request.get("http://{0}/wp-json/wp/v2/users".format(self._wordpress), auth=self._basic, headers=self._headers_json, params=params) + self._logger.debug("{0} : End Search author : {1}".format(self._name, author)) + self._logger.debug("{0} : Debug requests : {1}".format(self._name, page.content)) + + except ConnectionError as err: self._logger.error("{0} : Connection error for get author : {1}".format(self._name, err)) exit(1) except Exception as err: self._logger.error("{0} : Exception error for get author : {1}".format(self._name, err)) if page.status_code == 200: - self._logger.info("{0} : Get author id : {1}".format(self._name, result[0]["id"])) + self._logger.info("{0} : Get author id : {1}".format(self._name, result)) result = page.json() - data["author"] = result[0]["id"] + for a in result: + data["author"] = a["id"] else: self._logger.error("{0} : Connection error with status code for get author : {1}".format(self._name, page.status_code)) self._logger.debug("{0} : {1}".format(page.content)) - title = self._removeSpace(title) page_is_exist = False for index in range(1,10): params = {"search": title, "per_page":100, "page": index} try: - self._logger.info("{0} : Search post withi index {2} : {1}".format(self._name, title, index)) + self._logger.info("{0} : Search post with index {2} : {1}".format(self._name, title, index)) page = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params, headers=self._headers_json) except ConnectionError as err: self._logger.error("{0} : Connection error for search post : {1}".format(self._name, err))