From bba6cd1ca7561f3a8c8be70ce1aee6fd6eb4b2b0 Mon Sep 17 00:00:00 2001
From: Valentin CZERYBA
Date: Sun, 9 Apr 2023 22:49:44 +0200
Subject: [PATCH] add export canalblog

---
 WPExport.py                | 264 +++++++++++++++++--------------------
 import_export_canalblog.py |  28 +++-
 2 files changed, 147 insertions(+), 145 deletions(-)

diff --git a/WPExport.py b/WPExport.py
index 6fd0b8a..6db20ad 100644
--- a/WPExport.py
+++ b/WPExport.py
@@ -4,93 +4,44 @@
 from urllib.parse import urlparse
 import requests, os, argparse, logging
 
 class WPExport:
-    def __init__(self, url, logger, parser):
+    def __init__(self, url, logger, parser, directory):
         self._url = url
         self._logger = logger
         self._parser = parser
-
-    def _mkdirPath(self, path_dir, logger):
-        if not os.path.exists(path_dir):
-            makedir = []
-            pathh = path_dir.split("/")
-            for i in pathh:
-                makedir.append(i)
-                repath = "/".join(makedir)
-                if not os.path.exists(repath):
-                    self._logger.debug("Dossier crée : {0}".format(repath))
-                    try:
-                        if len(repath) > 0:
-                            os.mkdir(repath)
-                    except Exception as err:
-                        self._logger.error("Directory error : {0}".format(err))
-                        self._logger.debug("Directory error : {0} {1} {2} {3} {4}".format(err, path_dir, repath, pathh, makedir))
-                        exit(1)
+        self._dir = directory
 
-    def _getScriptCss(self, js, css, logger):
-        try:
-            page = requests.get(url)
-        except Exception as err:
-            self._logger.error("Connection error : {0}".format(err))
-            exit(1)
-        page_url = []
-        if page.status_code == 200:
-            soup = BeautifulSoup(page.text, self._parser)
-            if js is True:
-                script = soup.find_all("script")
-                for anchor in script:
-                    src = anchor.get("src", "/")
-                    if src != "/":
-                        try:
-                            u = urlparse(url)
-                            o = urlparse(src)
-                        except Exception as err:
-                            self._logger.error("parsing error : {0}".format(err))
-                            exit(1)
-                        if o.netloc == "":
-                            o = o._replace(netloc=u.netloc)
-                            o = o._replace(scheme=u.scheme)
-                        page_url.append(o.geturl())
-            if css is True:
-                link = soup.find_all("link")
-                for anchor in link:
-                    rel = anchor.get("rel")
-                    if rel[0] == "stylesheet":
-                        href = anchor.get("href", "/")
-                        if href != "/":
-                            try:
-                                u = urlparse(url)
-                                o = urlparse(href)
-                            except Exception as err:
-                                self._logger.error("parsing error : {0}".format(err))
-                                exit(1)
-                            if o.netloc == "":
-                                o = o._replace(netloc=u.netloc)
-                                o = o._replace(scheme=u.scheme)
-                            page_url.append(o.geturl())
-        return page_url
+    # Public method
 
-    def _getImg(self, webpage):
-        page_img = []
-        for i in webpage:
-            try:
-                page = requests.get(i)
-            except Exception as err:
-                self._logger.error("Connection error : {0}".format(err))
-                exit(1)
-            if page.status_code == 200:
-                soup = BeautifulSoup(page.text, self._parser)
-                img = soup.find_all("img")
-                self._logger.info("image from page: {0} : ".format(i))
-                for anchor in img:
-                    src = anchor.get("src", "/")
-                    if src != "/":
-                        if src not in page_img:
-                            self._logger.info("image: {0} : ".format(src))
-                            page_img.append(src)
-        return page_img
+    # Download JS
 
-    def _getUrlPage(self):
+    def downloadJs(self):
+        script = self._getScriptCss(True, False)
+        o = urlparse(self._url)
+        self._downloadPage(script, "{0}/{1}/{2}".format(self._dir, o.path, "dists/js"))
+
+    # Download CSS
+
+    def downloadCss(self):
+        css = self._getScriptCss(False, True)
+        o = urlparse(self._url)
+        self._downloadPage(css, "{0}/{1}/{2}".format(self._dir, o.path, "dists/css"))
+
+    # Download HTML
+
+    def downloadHTML(self, webpage):
+        self._downloadPage(webpage, self._dir)
+
+    # Download Image
+
+    def downloadImg(self, webpage):
+        page_src = self._getImg(webpage)
+        o = urlparse(self._url)
+        self._downloadPage(page_src, "{0}/{1}/{2}".format(self._dir, o.path, "img"))
+
+
+    # Get URL
+    def getUrlPage(self):
         try:
             page = requests.get(self._url)
         except Exception as err:
@@ -151,6 +102,95 @@ class WPExport:
 
         return webpage
 
 
+    # Private method
+    #
+    # Create path
+    def _mkdirPath(self, path_dir, logger):
+        if not os.path.exists(path_dir):
+            makedir = []
+            pathh = path_dir.split("/")
+            for i in pathh:
+                makedir.append(i)
+                repath = "/".join(makedir)
+                if not os.path.exists(repath):
+                    self._logger.debug("Directory created : {0}".format(repath))
+                    try:
+                        if len(repath) > 0:
+                            os.mkdir(repath)
+                    except Exception as err:
+                        self._logger.error("Directory error : {0}".format(err))
+                        self._logger.debug("Directory error : {0} {1} {2} {3} {4}".format(err, path_dir, repath, pathh, makedir))
+                        exit(1)
+
+
+    # Get Css and JS
+    def _getScriptCss(self, js, css):
+        try:
+            page = requests.get(self._url)
+        except Exception as err:
+            self._logger.error("Connection error : {0}".format(err))
+            exit(1)
+        page_url = []
+        if page.status_code == 200:
+            soup = BeautifulSoup(page.text, self._parser)
+            if js is True:
+                script = soup.find_all("script")
+                for anchor in script:
+                    src = anchor.get("src", "/")
+                    if src != "/":
+                        try:
+                            u = urlparse(self._url)
+                            o = urlparse(src)
+                        except Exception as err:
+                            self._logger.error("parsing error : {0}".format(err))
+                            exit(1)
+                        if o.netloc == "":
+                            o = o._replace(netloc=u.netloc)
+                            o = o._replace(scheme=u.scheme)
+                        page_url.append(o.geturl())
+            if css is True:
+                link = soup.find_all("link")
+                for anchor in link:
+                    rel = anchor.get("rel")
+                    if rel[0] == "stylesheet":
+                        href = anchor.get("href", "/")
+                        if href != "/":
+                            try:
+                                u = urlparse(self._url)
+                                o = urlparse(href)
+                            except Exception as err:
+                                self._logger.error("parsing error : {0}".format(err))
+                                exit(1)
+                            if o.netloc == "":
+                                o = o._replace(netloc=u.netloc)
+                                o = o._replace(scheme=u.scheme)
+                            page_url.append(o.geturl())
+        return page_url
+
+    # Get image
+
+    def _getImg(self, webpage):
+        page_img = []
+        for i in webpage:
+            try:
+                page = requests.get(i)
+            except Exception as err:
+                self._logger.error("Connection error : {0}".format(err))
+                exit(1)
+            if page.status_code == 200:
+                soup = BeautifulSoup(page.text, self._parser)
+                img = soup.find_all("img")
+                self._logger.info("image from page: {0} : ".format(i))
+                for anchor in img:
+                    src = anchor.get("src", "/")
+                    if src != "/":
+                        if src not in page_img:
+                            self._logger.info("image: {0} : ".format(src))
+                            page_img.append(src)
+        return page_img
+
+
+    # Download page
     def _downloadPage(self, webpage, backup_dir):
         for i in range(0, len(webpage)):
@@ -178,66 +218,4 @@ class WPExport:
                 open(fileDownload, "wb").write(r.content)
             except Exception as err:
                 self._logger.error("file error : {0}".format(err))
-                exit(1)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--url", help="canblog URL to be scraping", required=True)
-    parser.add_argument("--dir",
-                        default="backup",
-                        help="backup file path")
-    parser.add_argument("--debug", help="Verbosity", action="store_true")
-    parser.add_argument("--logfile", help="Log file", default="")
-    parser.add_argument("--no-css", help="No CSS", dest="css", action="store_true")
-    parser.add_argument("--no-js", help="No JS", dest="js", action="store_true")
-    parser.add_argument("--no-img", help="No img", dest="img", action="store_true")
-    parser.add_argument("--no-html", help="No HTML", dest="html", action="store_true")
-    parser.add_argument("--quiet", help="No console output", action="store_true")
-    args = parser.parse_args()
-    logger = logging.getLogger('web_scrap')
-    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-
-    if args.quiet is False:
-        ch = logging.StreamHandler()
-        if args.debug is True:
-            logger.setLevel(logging.DEBUG)
-            ch.setLevel(logging.DEBUG)
-        else:
-            logger.setLevel(logging.INFO)
-            ch.setLevel(logging.INFO)
-        ch.setFormatter(formatter)
-        logger.addHandler(ch)
-
-
-    if len(args.logfile) > 0:
-        fileHandler = logging.FileHandler(args.logfile)
-        if args.debug is True:
-            fileHandler.setLevel(logging.DEBUG)
-        else:
-            fileHandler.setLevel(logging.INFO)
-        fileHandler.setFormatter(formatter)
-        logger.addHandler(fileHandler)
-
-    try:
-        o = urlparse(args.url)
-        o = o._replace(scheme="https")
-        url = o.geturl().replace(":///", "://")
-    except Exception as err:
-        logger.error("parsing error : {0}".format(err))
-    if args.js is False:
-        script = getScriptCss(url, True, False, logger)
-        downloadPage(script, "{0}/{1}/{2}".format(args.dir, o.path, "dists/js"), logger)
-
-    if args.css is False:
-        css = getScriptCss(url, False, True, logger)
-        downloadPage(css, "{0}/{1}/{2}".format(args.dir, o.path, "dists/css"), logger)
-
-    if args.html is False or args.img is False:
-        webpage = getUrlPage(url, logger)
-        if args.html is False:
-            downloadPage(webpage, args.dir, logger)
-
-        if args.img is False:
-            page_src = getImg(webpage, logger)
-            downloadPage(page_src, "{0}/{1}/{2}".format(args.dir, o.path, "img"), logger)
+                exit(1)
\ No newline at end of file
diff --git a/import_export_canalblog.py b/import_export_canalblog.py
index cc19a06..b38d5be 100644
--- a/import_export_canalblog.py
+++ b/import_export_canalblog.py
@@ -18,6 +18,7 @@ if __name__ == '__main__':
     import_parser.add_argument("--user", help="wordpress user", required=True)
     import_parser.add_argument("--file", help="HTML file", default="")
     import_parser.add_argument("--directory", help="HTML directory", default="")
+    import_parser.add_argument("--canalblog", help="URL Canalblog", default="")
     import_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
 
     export_parser = subparsers.add_parser("export")
@@ -59,7 +60,7 @@ if __name__ == '__main__':
         fileHandler.setFormatter(formatter)
         logger.addHandler(fileHandler)
 
-    if args.command == "export":
+    if args.command == "import":
         password = getpass()
         if len(password) == 0:
             logger.error("No password error !!! ")
@@ -71,4 +72,27 @@ if __name__ == '__main__':
             importWp.fromFile(args.file.split(","))
             exit(0)
         if len(args.directory) > 0:
-            importWp.fromDirectory(args.directory)
\ No newline at end of file
+            importWp.fromDirectory(args.directory)
+            exit(0)
+    if args.command == "export":
+        try:
+            o = urlparse(args.url)
+            o = o._replace(scheme="https")
+            url = o.geturl().replace(":///", "://")
+        except Exception as err:
+            logger.error("parsing error : {0}".format(err))
+            exit(1)
+        exportWp = WPExport.WPExport(url, logger, args.parser, args.dir)
+        if args.js is False:
+            exportWp.downloadJs()
+
+        if args.css is False:
+            exportWp.downloadCss()
+
+        if args.html is False or args.img is False:
+            webpage = exportWp.getUrlPage()
+            if args.html is False:
+                exportWp.downloadHTML(webpage)
+
+            if args.img is False:
+                exportWp.downloadImg(webpage)
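
For reference, the export branch added to import_export_canalblog.py drives the refactored class in the order sketched below. This is only an illustration: the example URL, the "html.parser" value and the "backup" directory are placeholder assumptions, not values taken from this patch ("backup" is the default of the --dir option removed from WPExport.py).

    import logging
    from WPExport import WPExport

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("web_scrap")

    # Same call order as the new export branch: JS and CSS assets first,
    # then the page list, the HTML pages and finally the images.
    export = WPExport("https://example.canalblog.com", logger, "html.parser", "backup")
    export.downloadJs()
    export.downloadCss()
    webpage = export.getUrlPage()
    export.downloadHTML(webpage)
    export.downloadImg(webpage)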