diff --git a/.gitignore b/.gitignore index 1dfd775..a73c961 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -backup/ -backup1/ +backup*/ +wp-navigation web_scrap.log __pycache__/ diff --git a/import_export_canalblog.py b/import_export_canalblog.py index 6470699..8250a9e 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -2,17 +2,95 @@ from requests.auth import HTTPBasicAuth from getpass import getpass from urllib.parse import urlparse -import argparse, logging +from concurrent import futures + +import argparse, logging, threading from lib.WPImport import WPimport from lib.WPExport import WPExport +def download(name_thread, max_thread, url, logger, parser, directory, html, img): + + exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory) + + webpage = exportWp.getUrlPage(name_thread, max_thread) + if html is False: + exportWp.downloadHTML(webpage) + + if args.img is False: + exportWp.downloadImg(webpage) + del exportWp + +def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial): + canalblog = canalblog.split(",") + wordpress = wordpress.split(",") + name = "Thread-{0}".format(int(name_thread) + 1) + + if serial is False: + for canal in canalblog: + try: + o = urlparse(canal) + o = o._replace(scheme="https") + url = o.geturl().replace(":///", "://") + except Exception as err: + logger.error("{0} : parsing error : {1}".format(name, err)) + exit(1) + exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser) + webpage = exportWp.getUrlPage(name_thread, max_thread) + del exportWp + for j in wordpress: + importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser) + importWp.fromUrl(webpage) + del importWp + else: + if len(canalblog) != len(wordpress): + logger.error("{0} : ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress".format(name)) + exit(1) + for i in range(0, len(canalblog)-1): + try: + o = urlparse(canalblog[i]) + o = o._replace(scheme="https") + url = o.geturl().replace(":///", "://") + except Exception as err: + logger.error("parsing error : {0}".format(err)) + exit(1) + exportWp = WPExport(name=name, url=url, logger=logger, parser=parser) + webpage = exportWp.getUrlPage(name_thread, max_thread) + del exportWp + importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser) + importWp.fromUrl(webpage) + del importWp + + +def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial): + name = "Thread-{0}".format(int(name_thread) + 1) + directory = directory.split(",") + wordpress = wordpress.split(",") + if serial is False: + for i in wordpress: + importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser) + for j in directory: + importWp.fromDirectory(j, name_thread, max_thread) + del importWp + + else: + if len(directory) != len(wordpress): + logger.error("{0} : Error : Number directory is differant than wordpress".format(name)) + exit(1) + for i in range(0, len(wordpress)-1): + importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser) + importWp.fromDirectory(directory[i]) + del importWp + + + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--debug", help="Verbosity", action="store_true") parser.add_argument("--logfile", help="Log file", default="") parser.add_argument("--quiet", help="No console output", action="store_true") parser.add_argument("--parser", help="Parser content", default="html.parser") + parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1) subparsers = parser.add_subparsers(dest="command") @@ -76,64 +154,31 @@ if __name__ == '__main__': if len(args.file) > 0: for i in wordpress: importWp.setUrl(i) - importWp.fromFile(args.file.split(",")) - exit(0) + importWp.fromFile(files=args.file.split(",")) if len(args.directory) > 0: - directory = args.directory.split(",") - if args.serial is False: - for i in wordpress: - importWp.setUrl(i) - for j in directory: - importWp.fromDirectory(j) - else: - if len(directory) != len(wordpress): - logger.error("ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress") - exit(1) - for i in range(0, len(wordpress)-1): - importWp.setUrl(wordpress[i]) - importWp.fromDirectory(directory[i]) - exit(0) + try: + with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: + wait_for = [ + ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial) + for i in range(0, int(args.parallel)) + ] + except Exception as err: + logger.error("Threading error : {0}".format(err)) if len(args.canalblog) > 0: - exportWp = WPExport("", logger, args.parser, args.directory) - canalblog = args.canalblog.split(",") - wordpress = args.wordpress.split(",") - - if args.serial is False: - for canal in canalblog: - try: - o = urlparse(canal) - o = o._replace(scheme="https") - url = o.geturl().replace(":///", "://") - except Exception as err: - logger.error("parsing error : {0}".format(err)) - exit(1) - exportWp.setUrl(url) - webpage = exportWp.getUrlPage() - for j in wordpress: - importWp.setUrl(j) - importWp.fromUrl(webpage) - else: - if len(canalblog) != len(wordpress): - logger.error("ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress") - exit(1) - for i in range(0, len(canalblog)-1): - try: - o = urlparse(canalblog[i]) - o = o._replace(scheme="https") - url = o.geturl().replace(":///", "://") - except Exception as err: - logger.error("parsing error : {0}".format(err)) - exit(1) - exportWp.setUrl(url) - webpage = exportWp.getUrlPage() - importWp.setUrl(wordpress[i]) - importWp.fromUrl(webpage) - + try: + with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: + wait_for = [ + ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial) + for i in range(0, int(args.parallel)) + ] + except Exception as err: + logger.error("Threading error : {0}".format(err)) + exit(0) if args.command == "export": canalblog = args.url.split(",") - exportWp = WPExport("", logger, args.parser, args.directory) + exportWp = WPExport(logger=logger, parser=args.parser, directory=args.directory) for canal in canalblog: try: o = urlparse(canal) @@ -148,12 +193,17 @@ if __name__ == '__main__': if args.css is False: exportWp.downloadCss() + del exportWp if args.html is False or args.img is False: - webpage = exportWp.getUrlPage() - if args.html is False: - exportWp.downloadHTML(webpage) - - if args.img is False: - exportWp.downloadImg(webpage) + try: + with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: + wait_for = [ + ex.submit(download, i, int(args.parallel), url, logger, args.parser, args.directory, args.html, args.img) + for i in range(0, int(args.parallel)) + ] + except Exception as err: + logger.error("Threading error : {0}".format(err)) + + exit(0) \ No newline at end of file diff --git a/lib/WPExport.py b/lib/WPExport.py index da4f809..3bc4b6d 100644 --- a/lib/WPExport.py +++ b/lib/WPExport.py @@ -6,11 +6,12 @@ from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry class WPExport: - def __init__(self, url, logger, parser, directory): + def __init__(self, name = "Thread-0", url = "", logger = None, parser = "html.parser", directory = "backup"): self._url = url self._logger = logger self._parser = parser self._dir = directory + self._name = name self._request = requests.Session() @@ -20,8 +21,17 @@ class WPExport: self._request.mount('http://', HTTPAdapter(max_retries=retries)) + # Destructor + def __del__(self): + self._logger.info("{0} : Export finished for {1}".format(self._name, self._url)) + # Public method + # Set name + + def setName(self, name): + self._name = "Thread-{0}".format(int(name) + 1) + # Set URL def setUrl(self, url): @@ -55,11 +65,11 @@ class WPExport: # Get URL - def getUrlPage(self): + def getUrlPage(self, index_thread, max_thread): try: page = self._request.get(self._url) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error : {1}".format(self._name, err)) exit(1) page_url = [] if page.status_code == 200: @@ -70,8 +80,8 @@ class WPExport: if href != "#": page_url.append(href) else: - self._logger.error("Url did not get due status code : {0}".format(page.status_code)) - self._logger.debug(page.content) + self._logger.error("{0} : URL did not get due status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) webpage = [] @@ -79,10 +89,10 @@ class WPExport: try: page = self._request.get(i) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error : {1}".format(self._name, err)) exit(1) if page.status_code == 200: - self._logger.info("page : {0}".format(i)) + self._logger.info("{0} : page : {1}".format(self._name, i)) if i not in webpage: webpage.append(i) soup = BeautifulSoup(page.text, self._parser) @@ -94,13 +104,22 @@ class WPExport: element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1] number_page = element_lastpage.split("-")[0].split("p")[1] number_lastpage = int(number_page) / 10 - for j in range(1,int(number_lastpage)): + + setPageDivided = int(number_lastpage) / max_thread + setPagePart = setPageDivided * (index_thread + 1) + firstPagePart = (setPagePart - setPageDivided) + + self._logger.debug("{0} : Total page : {1}".format(self._name,int(number_lastpage))) + self._logger.debug("{0} : First range : {1}".format(self._name, int(firstPagePart))) + self._logger.debug("{0} : Last range : {1}".format(self._name, int(setPagePart))) + + for j in range(int(firstPagePart),int(setPagePart)): paging = j * 10 categorie = urlparse(i).path.split("/") url_paging = "{0}/archives/p{1}-10.html".format(self._url, paging) if len(categorie) > 2: url_paging = "{0}/archives/{1}/p{2}-10.html".format(self._url, categorie[2], paging) - self._logger.info(url_paging) + self._logger.info("{0} : {1}".format(self._name, url_paging)) if url_paging not in webpage: webpage.append(url_paging) page = self._request.get(url_paging) @@ -118,7 +137,7 @@ class WPExport: exit(1) webpage.append(o) else: - self._logger.error("web didn't get due status code : {0}".format(page.status_code)) + self._logger.error("{0} : web didn't get due status code : {1}".format(self._name, page.status_code)) self._logger.debug(page.content) return webpage @@ -135,7 +154,7 @@ class WPExport: makedir.append(i) repath = "/".join(makedir) if not os.path.exists(repath): - self._logger.debug("Dossier crée : {0}".format(repath)) + self._logger.debug("{0} : Dossier crée : {1}".format(self._name, repath)) try: if len(repath) > 0: os.mkdir(repath) @@ -201,21 +220,21 @@ class WPExport: try: page = self._request.get(i) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error : {1}".format(self._name, err)) exit(1) if page.status_code == 200: soup = BeautifulSoup(page.text, self._parser) img = soup.find_all("img") - self._logger.info("image from page: {0} : ".format(i)) + self._logger.info("{0} : image from page: {1} : ".format(self._name,i)) for anchor in img: src = anchor.get("src", "/") if src != "/": if src not in page_img: - self._logger.info("image: {0} : ".format(src)) + self._logger.info("{0} : image: {1} : ".format(self._name, src)) page_img.append(src) else: - self._logger.error("Image did not get due status code : {0}".format(page.status_code)) - self._logger.debug(page.content) + self._logger.error("{0} : Image did not get due status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) return page_img @@ -243,7 +262,7 @@ class WPExport: fileDownload = "{0}/{1}/index.html".format(backup_dir, o.netloc) if len(dir_page_web) > 0 and len(filePageWeb) > 0: fileDownload = "{0}/{1}{2}/{3}".format(backup_dir, o.netloc, dir_page_web, filePageWeb) - self._logger.info("{0}/{1} : {2}".format(i+1, len(webpage), fileDownload)) + self._logger.info("{0} : {1}/{2} : {3}".format(self._name, i+1, len(webpage), fileDownload)) try: open(fileDownload, "wb").write(r.content) except Exception as err: diff --git a/lib/WPImport.py b/lib/WPImport.py index d925f67..e3ed2ee 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -8,7 +8,8 @@ from requests.packages.urllib3.util.retry import Retry class WPimport: # Constructor - def __init__(self, basic, wordpress, logger, parser): + def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser"): + self._name = name self._basic = basic self._wordpress = wordpress self._logger = logger @@ -22,6 +23,11 @@ class WPimport: self._request.mount('http://', HTTPAdapter(max_retries=retries)) + + # Destructor + def __del__(self): + self._logger.info("{0} : Import finished for {1}".format(self._name, self._wordpress)) + # Public method def setUrl(self, wordpress): @@ -32,10 +38,10 @@ class WPimport: try: r = self._request.get(webpage[i]) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err)) exit(1) if r.status_code == 200: - self._logger.info("({0}/{1} : Page en cours d'import : {2}".format(i+1, len(webpage), webpage[i])) + self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i])) soup = BeautifulSoup(r.content, self._parser) articlebody = soup.find_all("div", class_="articlebody") if len(articlebody) > 0: @@ -43,25 +49,38 @@ class WPimport: else: self._addOrUpdateFeaturedMedia(soup) else: - self._logger.error("Connection error with status code : {0}".format(r.status_code)) - self._logger.debug(r.content) + self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code)) + self._logger.debug("{0} : {1}".format(self._name, r.content)) - def fromDirectory(self, directory): + def fromDirectory(self, directory="", number_thread=1, max_thread=1): directory = "{0}/archives".format(directory) directories = self._getDirectories([], "{0}".format(directory)) - files = self._getFiles(directories) - self.fromFile(files) + if len(directories) > 0: + files = self._getFiles(directories) + self.fromFile(files, number_thread, max_thread) + else: + self._logger.error("{0} : No files for {1}".format(self._name, directory)) - def fromFile(self, files): - for file in files: - if os.path.exists(file): - self._logger.info("Fichier en cours de traitement : {0}".format(file)) - with open(file, 'r') as f: + def fromFile(self, files=[], number_thread=1, max_thread=1): + divFiles = int(len(files) / max_thread) + currentRangeFiles = int(divFiles * (number_thread+1)) + firstRange = int(currentRangeFiles - divFiles) + self._logger.debug("{0} : index : {1}".format(self._name,number_thread)) + + self._logger.debug("{0} : first range : {1}".format(self._name,firstRange)) + self._logger.debug("{0} : last range : {1}".format(self._name,currentRangeFiles)) + + for i in range(firstRange, currentRangeFiles): + if os.path.exists(files[i]): + self._logger.info("{0} : ({1}/{2}) File is being processed : {3}".format(self._name, i+1, currentRangeFiles + 1, files[i])) + with open(files[i], 'r') as f: content = f.read() + self._logger.debug("{0} : Size of article : {1}".format(self._name, len(content))) soup = BeautifulSoup(content, self._parser) articlebody = soup.find_all("div", class_="articlebody") + self._logger.debug("{0} : Number of article : {1}".format(self._name, len(articlebody))) if len(articlebody) > 0: self._addOrUpdatePost(soup) else: @@ -99,7 +118,7 @@ class WPimport: try: page = self._request.get("http://{0}/wp-json/wp/v2/search".format(self._wordpress), auth=self._basic, params=params) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error : {1}".format(self._name, err)) exit(1) if page.status_code == 200: result = page.json() @@ -111,7 +130,7 @@ class WPimport: try: page = self._request.get(img_src) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for get featured media : {1}".format(self._name, err)) exit(1) if page.status_code == 200: name_img = img_src.replace("_q", "") @@ -120,7 +139,7 @@ class WPimport: try: page = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error search featured media : {1}".format(self._name, err)) exit(1) if page.status_code == 200: res = page.json() @@ -130,27 +149,27 @@ class WPimport: try: r = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, result[0]["id"]), auth=self._basic, headers=self._headers_json, data=json.dumps(data)) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for post media featured : {1}".format(self._name, err)) exit(1) if r.status_code == 200: - self._logger.info("Ajout media featured : {0}".format(r.json()["title"]["raw"])) + self._logger.info("{0} : Add media featured : {1}".format(self._name, r.json()["title"]["raw"])) else: - self._logger.error("Connection error with status code : {0}".format(r.status_code)) - self._logger.debug(r.content) + self._logger.error("{0} : Connection error with status code for featured media : {1}".format(self._name, r.status_code)) + self._logger.debug("{0} : {1}".format(self._name, r.content)) else: - self._logger.info("Aucun media trouvé pour {0}".format(h2)) + self._logger.info("{0} : No media found for {1}".format(self._name, h2)) else: - self._logger.error("Connection error with status code : {0}".format(page.status_code)) - self._logger.debug(page.content) + self._logger.error("{0} : Connection error with status code for search featured media: {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) else: - self._logger.error("Connection error with status code : {0}".format(page.status_code)) - self._logger.debug(page.content) + self._logger.error("{0} : Connection error for get featured media with status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) else: - self._logger.error("Connection error with status code : {0}".format(page.status_code)) - self._logger.debug(page.content) + self._logger.error("{0} : Connection error with status code for featured media : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) ## Association image to post @@ -161,13 +180,13 @@ class WPimport: try: r = self._request.post("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, i["id"]), auth=self._basic, data=data) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for link image to post : {1}".format(self._name, err)) exit(1) if r.status_code == 200: - self._logger.info("Association d'une image à l'article {0}".format(title)) + self._logger.info("{0} : Link image to post {1}".format(self._name, title)) else: - self._logger.error("Connection error with status code : {0}".format(r.status_code)) - self._logger.debug(r.content) + self._logger.error("{0} Connection error with status code for link image to post : {1}".format(self._name, r.status_code)) + self._logger.debug("{0} : {1}".format(self._name, r.content)) ## Add or update img @@ -176,12 +195,14 @@ class WPimport: media = {"id":"", "rendered":""} split_fileimg = href_img.split("/") img_name = split_fileimg[len(split_fileimg)-1] + self._logger.debug("{0} : Search for image {1} with URL {2}".format(self._name, img_name, "http://{0}/wp-json/wp/v2/media".format(self._wordpress))) params = { "search": img_name} try: r = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for search media : {1}".format(self._name, err)) exit(1) + self._logger.debug("{0} : Search for image {1} and his status code {2}".format(self._name, img_name, r.status_code)) if r.status_code == 200: res = r.json() if len(res) > 0: @@ -189,13 +210,13 @@ class WPimport: try: r = self._request.delete("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, res[0]["id"]), auth=self._basic, params=params) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} Connection error for delete image : {1}".format(self._name, err)) exit(1) if r.status_code == 200: - self._logger.info("Image removed {0}".format(img_name)) + self._logger.info("{0} : Image removed {1}".format(self._name, img_name)) else: - self._logger.error("Image not removed due status code : {0}".format(r.status_code)) - self._logger.debug(r.content) + self._logger.error("{0} : Image not removed due status code : {1}".format(self._name, r.status_code)) + self._logger.debug("{0} : {1}".format(self._name, r.content)) data = page.content img_type = "image/png" @@ -205,20 +226,20 @@ class WPimport: try: r = self._request.post("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, headers=headers, data=data) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for add image : {1}".format(self._name, err)) exit(1) if r.status_code == 201: - self._logger.info("Image added {0}".format(img_name)) + self._logger.info("{0} : Image added {1}".format(self._name, img_name)) res = r.json() media["id"] = res["id"] media["rendered"] = res["guid"]["rendered"] else: - self._logger.error("Image not added due status code : {0}".format(r.status_code)) + self._logger.error("{0} : Image not added due status code : {1}".format(self._name, r.status_code)) self._logger.debug(r.content) else: - self._logger.error("Connection error with status code : {0}".format(r.status_code)) - self._logger.debug(r.content) + self._logger.error("{0} : Connection error for search image with status code : {1}".format(self._name, r.status_code)) + self._logger.debug("{0} : {1}".format(self._name, r.content)) return media @@ -231,7 +252,7 @@ class WPimport: params = {"post": post, "author_name":i["author"], "date":i["date"]} page = self._request.get("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, params=params) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for search comment : {1}".format(self._name, err)) exit(1) if page.status_code == 200: result = page.json() @@ -240,18 +261,18 @@ class WPimport: params = {"force":1} page = self._request.delete("http://{0}/wp-json/wp/v2/comments/{1}".format(self._wordpress, j["id"]), params=params, auth=self._basic) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for delete comment : {1}".format(self._name, err)) exit(1) if page.status_code == 200: - self._logger.info("Comment deleted for {0}".format(title)) - self._logger.debug("Comment deleted : {0}".format(j)) + self._logger.info("{0} : Comment deleted for {1}".format(self._name, title)) + self._logger.debug("{0} : Comment deleted : {1}".format(self._name, j)) else: - self._logger.error("Comment not deleted for {0} due status code : {1}".format(title, page.status_code)) - self._logger.debug(page.content) + self._logger.error("{0} : Comment not deleted for {1} due status code : {2}".format(self._name, title, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) else: - self._logger.error("Comment not listed for {0} due status code : {1}".format(title, page.status_code)) - self._logger.debug(page.content) + self._logger.error("{0} : Comment not listed for {1} due status code : {2}".format(self._name, title, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) for i in comment: data = {"post": post, "content": i["content"], "date": i["date"], "author_name": i["author"], "status": "approved"} @@ -262,27 +283,27 @@ class WPimport: try: page = self._request.get("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, params=params) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for parent comment : {1}".format(self._name, err)) exit(1) if page.status_code == 200: result = page.json() if len(result) > 0: data["parent"]=result[0]["id"] else: - self._logger.error("Connection error for parent comment with status code : {0}".format(page.status_code)) - self._logger.debug(page.content) + self._logger.error("{0} : Connection error for parent comment with status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) try: page = self._request.post("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, data=data) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for add comment : {1}".format(self._name, err)) exit(1) if page.status_code == 201: - self._logger.info("Comment added for {0}".format(title)) - self._logger.debug("Data : {0}".format(data)) + self._logger.info("{0} : Comment added for {1}".format(self._name, title)) + self._logger.debug("{0} : Data : {1}".format(self._name, data)) else: - self._logger.error("Comment not added for {0} due status code : {1}".format(title, page.status_code)) - self._logger.debug(page.content) + self._logger.error("{0} : Comment not added for {1} due status code : {2}".format(self._name, title, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) ## Check class name @@ -346,6 +367,7 @@ class WPimport: listelement[i] = [] articletitle = soup.find_all("h2", class_="articletitle") + self._logger.debug("{0} : Title of the article : {1}".format(self._name, articletitle)) articlebody = soup.find_all("div", class_="articlebody") articledate = soup.find_all("span", class_="articledate") articleacreator = soup.find_all("span", class_="articlecreator") @@ -353,10 +375,12 @@ class WPimport: itemfooter = soup.find_all("div", class_="itemfooter") comment = soup.find_all("li", class_="comment") img_a = articlebody[0].find_all("a", {"target": "_blank"}) + self._logger.debug("{0} : Number of image's link : {1}".format(self._name, len(img_a))) list_img = [] for i in img_a: new_img = {} img = i.find_all("img") + self._logger.debug("{0} : Number of image's tag : {1}".format(self._name, len(img))) if len(img) > 0: href_a = i.get("href") href_img = img[0].get("src") @@ -365,16 +389,16 @@ class WPimport: try: page_img = self._request.get(href_img) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err)) exit(1) if page_img.status_code == 404: href_img = href_a try: page_img = self._request.get(href_a) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err)) exit(1) - + self._logger.debug("{0} : Status code for image {1} : {2}".format(self._name, href_img, page_img.status_code)) if page_img.status_code == 200: media=self._addOrUpdateMedia(href_img, page_img) new_img["id"]=media["id"] @@ -386,10 +410,10 @@ class WPimport: new_img["new_src"]=media["rendered"] list_img.append(new_img) if page_img.status_code not in [200, 404]: - self._logger.error("Connection error with status code : {0}".format(page_img.status_code)) - self._logger.debug(page_img.content) + self._logger.error("{0} : Connection error with status code for get image : {1}".format(self._name, page_img.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page_img.content)) - + self._logger.debug("{0} : Number of image : {1}".format(self._name, len(list_img))) comment_post = self._getComment(comment) a = itemfooter[0].find_all("a", {"rel": True}) @@ -408,7 +432,7 @@ class WPimport: params = {"params":j} page = self._request.get("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, params=params) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for {1} : {2}".format(self._name, i, err)) exit(1) if page.status_code == 200: element_exist = True @@ -416,26 +440,26 @@ class WPimport: listelement[i].append(result[0]["id"]) else: - self._logger.error("{0} not found due status code : {1}".format(i, page.status_code)) - self._logger.debug(page.content) + self._logger.error("{0} : {1} not found due status code : {2}".format(self._name, i, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) if element_exist is False: data = {"name": j} - self._logger.debug("URL : {0} ".format("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i))) - self._logger.debug("data : {0}".format(data)) - self._logger.debug("headers : {0}".format(self._headers_form)) + self._logger.debug("{0} : URL : {1} ".format("http://{1}/wp-json/wp/v2/{2}".format(self._name, self._wordpress, i))) + self._logger.debug("{0} : data : {1}".format(self._name, data)) + self._logger.debug("{0} : headers : {1}".format(self._name, self._headers_form)) try: page = self._request.post("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, headers=self._headers_json, data=data) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for post {1} : {2}".format(self._name, i, err)) exit(1) if page.status_code == 201: result = page.json() listelement[i].append(result["id"]) else: - self._logger.error("{0} not added due status code : {1}".format(i, page.status_code)) - self._logger.debug(page.content) + self._logger.error("{0} : {1} not added due status code : {2}".format(self._name, i, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) title = articletitle[0].text author = articleacreator[0].text.lower() @@ -458,21 +482,21 @@ class WPimport: try: page = self._request.get("http://{0}/wp-json/wp/v2/users".format(self._wordpress), auth=self._basic, params=params) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for get author : {1}".format(self._name, err)) exit(1) if page.status_code == 200: result = page.json() data["author"] = result[0]["id"] else: - self._logger.error("Connection error with status code : {0}".format(page.status_code)) - self._logger.debug(page.content) + self._logger.error("{0} : Connection error with status code for get author : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(page.content)) params = {"search":title} try: page = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for search post : {1}".format(self._name, err)) exit(1) page_exist = True headers = {'Content-Type': 'application/json', 'Accept':'application/json'} @@ -481,38 +505,38 @@ class WPimport: if len(result) == 0: page_exist = False else: - self._logger.info("La page {0} existe deja et mis à jour".format(title)) + self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, title)) post_id = result[0]["id"] try: page = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, data=json.dumps(data)) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for update post : {1}".format(self._name, err)) exit(1) if page.status_code == 200: result = page.json() - self._logger.info("Article mis à jour : {0}".format(result["title"]["raw"])) + self._logger.info("{0} : Post updated : {1}".format(self._name, result["title"]["raw"])) self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) self._linkImgPost(result["title"]["raw"], list_img, result["id"]) else: - self._logger.error("Post not updated due status code : {0}".format(page.status_code)) - self._logger.debug(page.content) + self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) else: - self._logger.error("Connection for update post error with status code : {0}".format(page.status_code)) - self._logger.debug(page.content) + self._logger.error("{0} : Connection for update post error with status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) if page_exist == False: try: page = self._request.post("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, headers=headers, data=json.dumps(data)) except Exception as err: - self._logger.error("Connection error : {0}".format(err)) + self._logger.error("{0} : Connection error for create post : {1}".format(self._name, err)) exit(1) if page.status_code == 201: result = page.json() - self._logger.info("Article ajoute : {0}".format(result["title"]["raw"])) + self._logger.info("{0} : Post added : {1}".format(self._name, result["title"]["raw"])) self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) self._linkImgPost(result["title"]["raw"], list_img, result["id"]) else: - self._logger.error("Post not added due status code : {0}".format(r.status_code)) - self._logger.debug(r.content) + self._logger.error("{0} : Post not added due status code : {1}".format(self._name, r.status_code)) + self._logger.debug("{0} : {1}".format(self._name, r.content))