From 3e7689267691f5178accda628632162caa6aca51 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Mon, 5 Jun 2023 23:46:57 +0200 Subject: [PATCH 1/6] add wpchange --- lib/WPChange.py | 78 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 lib/WPChange.py diff --git a/lib/WPChange.py b/lib/WPChange.py new file mode 100644 index 0000000..2cf58e5 --- /dev/null +++ b/lib/WPChange.py @@ -0,0 +1,78 @@ +from bs4 import BeautifulSoup +from urllib.parse import urlparse +import requests, os, logging, re, json + +class WPRemove: + # Constructor + def __init__(self, index_name=1, number_thread=1, logger=None): + self._name = "Thread-{0}".format(index_name) + self._logger = logger + self._number_thread = number_thread + + # Destructor + def __del__(self): + print("{0} : Import finished".format(self._name)) + + + # Public method + + ## from file + + def fromFile(self, files=[], number_thread=1, max_thread=1): + divFiles = int(len(files) / max_thread) + currentRangeFiles = int(divFiles * (number_thread+1)) + firstRange = int(currentRangeFiles - divFiles) + self._logger.debug("{0} : index : {1}".format(self._name,number_thread)) + + self._logger.debug("{0} : first range : {1}".format(self._name,firstRange)) + self._logger.debug("{0} : last range : {1}".format(self._name,currentRangeFiles)) + + for i in range(firstRange, currentRangeFiles): + if os.path.exists(files[i]): + self._logger.info("{0} : ({1}/{2}) File is being processed : {3}".format(self._name, i+1, currentRangeFiles + 1, files[i])) + with open(files[i], 'r') as f: + content = f.read() + self._logger.debug("{0} : Size of article : {1}".format(self._name, len(content))) + soup = BeautifulSoup(content, self._parser) + articlebody = soup.find_all("div", class_="articlebody") + self._logger.debug("{0} : Number of article : {1}".format(self._name, len(articlebody))) + if len(articlebody) > 0: + self._addOrUpdatePost(soup) + else: + self._addOrUpdateFeaturedMedia(soup) + + + ## From directory + + def fromDirectory(self, directory="", number_thread=1, max_thread=1): + directory = "{0}/archives".format(directory) + directories = self._getDirectories([], "{0}".format(directory)) + if len(directories) > 0: + files = self._getFiles(directories) + self.fromFile(files, number_thread, max_thread) + else: + self._logger.error("{0} : No files for {1}".format(self._name, directory)) + + + # Private method + + ## Get all files + + def _getFiles(self, item): + files = [] + for i in item: + for j in os.listdir(i): + if os.path.isfile("{0}/{1}".format(i, j)): + files.append("{0}/{1}".format(i, j)) + return files + + + ## Get directories + + def _getDirectories(self, subdirectory, item): + sub = subdirectory + for i in os.listdir(item): + if os.path.isdir("{0}/{1}".format(item, i)): + sub.append("{0}/{1}".format(item, i)) + subdirectory = self._getDirectories(sub, "{0}/{1}".format(item, i)) + return subdirectory From c631909cb6c91ddf84287672a0f84d027943d41e Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Tue, 6 Jun 2023 00:22:16 +0200 Subject: [PATCH 2/6] WPchange wip --- lib/WPChange.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/lib/WPChange.py b/lib/WPChange.py index 2cf58e5..b6e7b05 100644 --- a/lib/WPChange.py +++ b/lib/WPChange.py @@ -4,10 +4,11 @@ import requests, os, logging, re, json class WPRemove: # Constructor - def __init__(self, index_name=1, number_thread=1, logger=None): + def __init__(self, index_name=1, number_thread=1, logger=None, parser="html.parser"): self._name = "Thread-{0}".format(index_name) self._logger = logger self._number_thread = number_thread + self._parser = parser # Destructor def __del__(self): @@ -17,7 +18,7 @@ class WPRemove: # Public method ## from file - + def fromFile(self, files=[], number_thread=1, max_thread=1): divFiles = int(len(files) / max_thread) currentRangeFiles = int(divFiles * (number_thread+1)) @@ -30,17 +31,8 @@ class WPRemove: for i in range(firstRange, currentRangeFiles): if os.path.exists(files[i]): self._logger.info("{0} : ({1}/{2}) File is being processed : {3}".format(self._name, i+1, currentRangeFiles + 1, files[i])) - with open(files[i], 'r') as f: - content = f.read() - self._logger.debug("{0} : Size of article : {1}".format(self._name, len(content))) - soup = BeautifulSoup(content, self._parser) - articlebody = soup.find_all("div", class_="articlebody") - self._logger.debug("{0} : Number of article : {1}".format(self._name, len(articlebody))) - if len(articlebody) > 0: - self._addOrUpdatePost(soup) - else: - self._addOrUpdateFeaturedMedia(soup) - + self._change(files[i]) + ## From directory @@ -76,3 +68,13 @@ class WPRemove: sub.append("{0}/{1}".format(item, i)) subdirectory = self._getDirectories(sub, "{0}/{1}".format(item, i)) return subdirectory + + ## Change path img file + + def _change(self, file): + with open(file, 'r') as f: + content = f.read() + soup = BeautifulSoup(content, self._parser) + img = soup.find_all("img") + + From 16368c13bb04437570f22f47f1ea733d3ef8e0db Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Sat, 10 Jun 2023 01:58:08 +0200 Subject: [PATCH 3/6] add WPChange --- import_export_canalblog.py | 30 +++++++++++++++++++++++++++++- lib/WPChange.py | 5 ++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/import_export_canalblog.py b/import_export_canalblog.py index c14d9f7..418df05 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -9,6 +9,13 @@ import argparse, logging, threading from lib.WPImport import WPimport from lib.WPExport import WPExport from lib.WPRemove import WPRemove +from lib.WPChange import WPChange + +def change(index, number, args, logger): + changeWp = WPChange(logger=logger, index_name=index, number_thread=number) + changeWp.fromDirectory(args.directory) + + del changeWp def remove(index, number, args, basic, logger, ssl_wordpress): removeWp = WPRemove(basic=basic, wordpress="", logger=logger, ssl_wordpress=ssl_wordpress, index_name=index, number_thread=number) @@ -172,7 +179,15 @@ if __name__ == '__main__': export_parser.add_argument("--no-img", help="No img", dest="img", action="store_true") export_parser.add_argument("--no-html", help="No HTML", dest="html", action="store_true") - + change_parser = subparsers.add_parser("change") + change_parser.add_argument("--directory", + default="", + help="Directory") + change_parser.add_argument("--file", + default="", + help="File") + + args = parser.parse_args() @@ -293,4 +308,17 @@ if __name__ == '__main__': wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ] except Exception as err: logger.error("Thread error for remove : {0}".format(err)) + exit(0) + + if args.command == "change": + if len(args.directory) > 0: + try: + with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: + wait_for = [ ex.submit(change, i, args.parallel, args, logger) for i in range(0, int(args.parallel)) ] + except Exception as err: + logger.error("Thread error for remove : {0}".format(err)) + if len(args.file) > 0: + changeWp = WPChange(logger=logger) + for filei in args.file.split(","): + changeWp.fromFile(filei) exit(0) \ No newline at end of file diff --git a/lib/WPChange.py b/lib/WPChange.py index b6e7b05..255e9c1 100644 --- a/lib/WPChange.py +++ b/lib/WPChange.py @@ -2,7 +2,7 @@ from bs4 import BeautifulSoup from urllib.parse import urlparse import requests, os, logging, re, json -class WPRemove: +class WPChange: # Constructor def __init__(self, index_name=1, number_thread=1, logger=None, parser="html.parser"): self._name = "Thread-{0}".format(index_name) @@ -76,5 +76,8 @@ class WPRemove: content = f.read() soup = BeautifulSoup(content, self._parser) img = soup.find_all("img") + for i in img: + src = i.get("src") + self._logger.info(src) From 9e7e1b27fd8ea69fd760902b074fede33cf97eaa Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Sun, 11 Jun 2023 20:24:22 +0200 Subject: [PATCH 4/6] change WIP test --- lib/WPChange.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/lib/WPChange.py b/lib/WPChange.py index 255e9c1..c08b5f0 100644 --- a/lib/WPChange.py +++ b/lib/WPChange.py @@ -21,14 +21,16 @@ class WPChange: def fromFile(self, files=[], number_thread=1, max_thread=1): divFiles = int(len(files) / max_thread) - currentRangeFiles = int(divFiles * (number_thread+1)) + currentRangeFiles = int(divFiles * (number_thread)) firstRange = int(currentRangeFiles - divFiles) self._logger.debug("{0} : index : {1}".format(self._name,number_thread)) + self._logger.debug("{0} : first range : {1}".format(self._name,firstRange)) self._logger.debug("{0} : last range : {1}".format(self._name,currentRangeFiles)) for i in range(firstRange, currentRangeFiles): + if os.path.exists(files[i]): self._logger.info("{0} : ({1}/{2}) File is being processed : {3}".format(self._name, i+1, currentRangeFiles + 1, files[i])) self._change(files[i]) @@ -72,12 +74,27 @@ class WPChange: ## Change path img file def _change(self, file): - with open(file, 'r') as f: - content = f.read() - soup = BeautifulSoup(content, self._parser) - img = soup.find_all("img") - for i in img: - src = i.get("src") - self._logger.info(src) + try: + with open(file, 'r') as f: + content = f.read() + soup = BeautifulSoup(content, self._parser) + img = soup.find_all("img") + for i in img: + src = i.get("src") + o = urlparse(src) + if len(o.netloc) > 0: + self._logger.info("{0} : Change source {1} /dists/img/{2}/{3}".format(self._name, src, o.netloc, o.path)) + content = content.replace(src, "/dists/img/{0}/{1}".format(o.netloc, o.path)) + try: + with open(file, "w") as f: + self._logger.info("{0} : File write : {1}".format(self._name, file)) + f.write(content) + except Exception as ex: + self._logger.error("{0} : Error for write file {1} : {2}".format(self._name, file, ex)) + + except Exception as ex: + self._logger.error("{0} : Error for read file {1} : {2}".format(self._name, file, ex)) + + From 7228911e6859b7cf96b11dc81d3e8df3843c3995 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Tue, 13 Jun 2023 00:38:34 +0200 Subject: [PATCH 5/6] add js and css --- lib/WPChange.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/lib/WPChange.py b/lib/WPChange.py index c08b5f0..b12c0c4 100644 --- a/lib/WPChange.py +++ b/lib/WPChange.py @@ -83,8 +83,24 @@ class WPChange: src = i.get("src") o = urlparse(src) if len(o.netloc) > 0: - self._logger.info("{0} : Change source {1} /dists/img/{2}/{3}".format(self._name, src, o.netloc, o.path)) - content = content.replace(src, "/dists/img/{0}/{1}".format(o.netloc, o.path)) + self._logger.info("{0} : Change source image {1} /img/{2}/{3}".format(self._name, src, o.netloc, o.path)) + content = content.replace(src, "/img/{0}/{1}".format(o.netloc, o.path)) + script = soup.find_all("script", {"type": "text/javascript"}) + for i in script: + src = i.get("src") + if src is not None: + o = urlparse(src) + if len(o.netloc) > 0: + self._logger.info("{0} : Change source js {1} /dists/js/{2}/{3}".format(self._name, src, o.netloc, o.path)) + content = content.replace(src, "/dists/js/{0}/{1}".format(o.netloc, o.path)) + link = soup.find_all("link", {"rel": "stylesheet"}) + for i in link: + href = i.get("href") + if href is not None: + o = urlparse(href) + if len(o.netloc) > 0: + self._logger.info("{0} : Change source css {1} /dists/css/{2}/{3}".format(self._name, href, o.netloc, o.path)) + content = content.replace(href, "/dists/css/{0}/{1}".format(o.netloc, o.path)) try: with open(file, "w") as f: self._logger.info("{0} : File write : {1}".format(self._name, file)) From da4db0277a917d23c4d4a98cc0d5edd4cf73de58 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Tue, 13 Jun 2023 00:46:18 +0200 Subject: [PATCH 6/6] add img a change --- lib/WPChange.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lib/WPChange.py b/lib/WPChange.py index b12c0c4..8d2b626 100644 --- a/lib/WPChange.py +++ b/lib/WPChange.py @@ -74,6 +74,7 @@ class WPChange: ## Change path img file def _change(self, file): + ext_img = ["png", "svg", "gif", "jpg", "jpeg"] try: with open(file, 'r') as f: content = f.read() @@ -101,6 +102,17 @@ class WPChange: if len(o.netloc) > 0: self._logger.info("{0} : Change source css {1} /dists/css/{2}/{3}".format(self._name, href, o.netloc, o.path)) content = content.replace(href, "/dists/css/{0}/{1}".format(o.netloc, o.path)) + + a = soup.find_all("a", {"target": "_blank"}) + for i in a: + href = i.get("href") + if href is not None: + o = urlparse(href) + if len(o.netloc) > 0: + ext = o.path.split(".")[len(o.path.split("."))-1] + if ext in ext_img: + self._logger.info("{0} : Change a img {1} /img/{2}/{3}".format(self._name, href, o.netloc, o.path)) + content = content.replace(href, "/img/{0}/{1}".format(o.netloc, o.path)) try: with open(file, "w") as f: self._logger.info("{0} : File write : {1}".format(self._name, file))