#!/usr/bin/python3
"""Command-line entry point to export a Canalblog site or import it into WordPress.

Two sub-commands are provided:

* ``export`` -- uses ``WPExport`` to download JS/CSS and, in worker threads,
  HTML pages and images of one or more comma-separated URLs.
* ``import`` -- uses ``WPimport`` to push content into one or more
  comma-separated WordPress URLs, from files, directories or Canalblog URLs.

The amount of worker threads is controlled by ``--parallel``.
"""
import argparse
import logging
import sys
import threading
from concurrent import futures
from getpass import getpass
from urllib.parse import urlparse

from requests.auth import HTTPBasicAuth

from lib.WPImport import WPimport
from lib.WPExport import WPExport


def _normalize_url(raw):
    """Return *raw* with its scheme forced to ``https``.

    A value given without scheme (``blog.example.com``) is parsed by
    ``urlparse`` as a bare path, so ``geturl()`` can produce
    ``https:///blog.example.com``; the triple slash is collapsed afterwards.
    Exceptions raised by ``urlparse`` are left to the caller.
    """
    parsed = urlparse(raw)._replace(scheme="https")
    return parsed.geturl().replace(":///", "://")


def download(name_thread, max_thread, url, logger, parser, directory, html, img):
    """Worker: download the HTML and/or images of one thread's share of *url*.

    Args:
        name_thread: zero-based index of this worker.
        max_thread: total number of workers.
        url: normalized URL handed to ``WPExport``.
        logger: logger handed to ``WPExport``.
        parser: parser name handed to ``WPExport``.
        directory: backup directory handed to ``WPExport``.
        html: value of ``--no-html``; HTML is downloaded only when False.
        img: value of ``--no-img``; images are downloaded only when False.
    """
    exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url,
                        logger=logger, parser=parser, directory=directory)
    # presumably the pages assigned to this worker -- verify against WPExport
    webpage = exportWp.getUrlPage(name_thread, max_thread)
    if html is False:
        exportWp.downloadHTML(webpage)
    # Use the parameter, not the script-level ``args`` global, so the function
    # also works when it is imported from another module.
    if img is False:
        exportWp.downloadImg(webpage)
    del exportWp


def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial):
    """Worker: import pages of Canalblog URLs into WordPress sites.

    Args:
        name_thread: zero-based index of this worker.
        max_thread: total number of workers.
        canalblog: comma-separated Canalblog URLs.
        logger: logger used for errors and handed to the helpers.
        parser: parser name handed to the helpers.
        wordpress: comma-separated WordPress URLs.
        basic: HTTP basic-auth object handed to ``WPimport``.
        serial: when False every Canalblog URL is imported into every
            WordPress URL; when True the two lists are paired one-to-one and
            must have the same length.

    Raises:
        SystemExit: on a URL parsing error or a length mismatch in serial mode.
    """
    canalblog = canalblog.split(",")
    wordpress = wordpress.split(",")
    name = "Thread-{0}".format(int(name_thread) + 1)
    if serial is False:
        for canal in canalblog:
            try:
                url = _normalize_url(canal)
            except Exception as err:
                logger.error("{0} : parsing error : {1}".format(name, err))
                sys.exit(1)
            exportWp = WPExport(name=name, url=url, logger=logger, parser=parser)
            webpage = exportWp.getUrlPage(name_thread, max_thread)
            del exportWp
            for target in wordpress:
                importWp = WPimport(name=name, basic=basic, wordpress=target,
                                    logger=logger, parser=parser)
                importWp.fromUrl(webpage)
                del importWp
    else:
        if len(canalblog) != len(wordpress):
            logger.error("{0} : ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress".format(name))
            sys.exit(1)
        # zip() visits every pair; the former range(0, len - 1) skipped the
        # last one and did nothing at all for a single pair.
        for canal, target in zip(canalblog, wordpress):
            try:
                url = _normalize_url(canal)
            except Exception as err:
                logger.error("parsing error : {0}".format(err))
                sys.exit(1)
            exportWp = WPExport(name=name, url=url, logger=logger, parser=parser)
            webpage = exportWp.getUrlPage(name_thread, max_thread)
            del exportWp
            importWp = WPimport(name=name, basic=basic, wordpress=target,
                                logger=logger, parser=parser)
            importWp.fromUrl(webpage)
            del importWp


def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial):
    """Worker: import HTML directories into WordPress sites.

    Args:
        name_thread: zero-based index of this worker.
        max_thread: total number of workers.
        directory: comma-separated directories.
        logger: logger used for errors and handed to ``WPimport``.
        parser: parser name handed to ``WPimport``.
        wordpress: comma-separated WordPress URLs.
        basic: HTTP basic-auth object handed to ``WPimport``.
        serial: when False every directory is imported into every WordPress
            URL; when True the two lists are paired one-to-one and must have
            the same length.

    Raises:
        SystemExit: on a length mismatch in serial mode.
    """
    name = "Thread-{0}".format(int(name_thread) + 1)
    directory = directory.split(",")
    wordpress = wordpress.split(",")
    if serial is False:
        for target in wordpress:
            importWp = WPimport(name=name, basic=basic, wordpress=target,
                                logger=logger, parser=parser)
            for source in directory:
                importWp.fromDirectory(source, name_thread, max_thread)
            del importWp
    else:
        if len(directory) != len(wordpress):
            logger.error("{0} : Error : Number directory is differant than wordpress".format(name))
            sys.exit(1)
        # zip() visits every pair; the former range(0, len - 1) skipped the
        # last one and did nothing at all for a single pair.
        for source, target in zip(directory, wordpress):
            importWp = WPimport(name=name, basic=basic, wordpress=target,
                                logger=logger, parser=parser)
            # NOTE(review): the thread slot is now forwarded exactly as in the
            # non-serial branch above; previously it was omitted here, which
            # presumably made every worker import the whole directory --
            # confirm against WPimport.fromDirectory.
            importWp.fromDirectory(source, name_thread, max_thread)
            del importWp


def _run_parallel(logger, worker, parallel, *worker_args):
    """Run *worker* in *parallel* threads and report their failures.

    Each worker is called as ``worker(index, parallel, *worker_args)``.
    ``Future.result()`` is called on every future so that an exception raised
    inside a worker is logged instead of being silently dropped, and a
    ``SystemExit`` raised by a worker terminates the program as intended.
    """
    try:
        with futures.ThreadPoolExecutor(max_workers=parallel) as ex:
            wait_for = [
                ex.submit(worker, i, parallel, *worker_args)
                for i in range(parallel)
            ]
            for future in wait_for:
                try:
                    future.result()
                except Exception as err:
                    logger.error("Threading error : {0}".format(err))
    except Exception as err:
        # e.g. ValueError from ThreadPoolExecutor when parallel <= 0
        logger.error("Threading error : {0}".format(err))


def _build_parser():
    """Build the argument parser with its ``import`` and ``export`` sub-commands."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", help="Verbosity", action="store_true")
    parser.add_argument("--logfile", help="Log file", default="")
    parser.add_argument("--quiet", help="No console output", action="store_true")
    parser.add_argument("--parser", help="Parser content", default="html.parser")
    # type=int: validate once here instead of calling int() at every use.
    parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1, type=int)

    subparsers = parser.add_subparsers(dest="command")

    import_parser = subparsers.add_parser("import")
    import_parser.add_argument("--user", help="wordpress user", required=True)
    import_parser.add_argument("--file", help="HTML file", default="")
    import_parser.add_argument("--directory", help="HTML directory", default="")
    import_parser.add_argument("--canalblog", help="URL Canalblog", default="")
    import_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
    import_parser.add_argument("--serial", help="Serial execution", action="store_true")

    export_parser = subparsers.add_parser("export")
    export_parser.add_argument("--url", help="canblog URL to be scraping", required=True)
    export_parser.add_argument("--directory",
                               default="backup",
                               help="backup file path")
    # The --no-* flags store True when given; the feature runs when the
    # stored value is False.
    export_parser.add_argument("--no-css", help="No CSS", dest="css", action="store_true")
    export_parser.add_argument("--no-js", help="No JS", dest="js", action="store_true")
    export_parser.add_argument("--no-img", help="No img", dest="img", action="store_true")
    export_parser.add_argument("--no-html", help="No HTML", dest="html", action="store_true")
    return parser


def _configure_logging(args):
    """Create the script logger with console and/or file handlers.

    The logger level is set unconditionally: it used to be set only when the
    console handler was created, so ``--quiet --logfile f`` left the logger at
    the inherited WARNING level and the file never received INFO/DEBUG records.
    """
    level = logging.DEBUG if args.debug is True else logging.INFO
    logger = logging.getLogger('import export canalblog')
    logger.setLevel(level)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    if args.quiet is False:
        ch = logging.StreamHandler()
        ch.setLevel(level)
        ch.setFormatter(formatter)
        logger.addHandler(ch)

    if len(args.logfile) > 0:
        fileHandler = logging.FileHandler(args.logfile)
        fileHandler.setLevel(level)
        fileHandler.setFormatter(formatter)
        logger.addHandler(fileHandler)
    return logger


def _run_import(args, logger):
    """Handle the ``import`` sub-command (files, directories, Canalblog URLs)."""
    password = getpass()
    if len(password) == 0:
        logger.error("No password error !!! ")
        sys.exit(1)

    basic = HTTPBasicAuth(args.user, password)
    wordpress = args.wordpress.split(",")
    # NOTE(review): positional arguments here while the workers use keywords
    # (name=, basic=, wordpress=, ...) -- confirm the order matches
    # WPimport.__init__.
    importWp = WPimport(basic, "", logger, args.parser)
    if len(args.file) > 0:
        for target in wordpress:
            importWp.setUrl(target)
            importWp.fromFile(files=args.file.split(","))
    if len(args.directory) > 0:
        _run_parallel(logger, importDirectory, args.parallel, args.directory,
                      logger, args.parser, args.wordpress, basic, args.serial)
    if len(args.canalblog) > 0:
        _run_parallel(logger, importUrl, args.parallel, args.canalblog,
                      logger, args.parser, args.wordpress, basic, args.serial)


def _run_export(args, logger):
    """Handle the ``export`` sub-command for every comma-separated URL."""
    exportWp = WPExport(logger=logger, parser=args.parser, directory=args.directory)
    for canal in args.url.split(","):
        try:
            url = _normalize_url(canal)
        except Exception as err:
            logger.error("parsing error : {0}".format(err))
            sys.exit(1)
        # The exporter is kept alive for the whole loop; deleting it after the
        # first URL would make the next setUrl() fail with NameError.
        exportWp.setUrl(url)
        if args.js is False:
            exportWp.downloadJs()
        if args.css is False:
            exportWp.downloadCss()

        if args.html is False or args.img is False:
            _run_parallel(logger, download, args.parallel, url, logger,
                          args.parser, args.directory, args.html, args.img)
    del exportWp


def main():
    """Parse the command line, configure logging and run the chosen command."""
    args = _build_parser().parse_args()
    logger = _configure_logging(args)

    if args.command == "import":
        _run_import(args, logger)
        sys.exit(0)

    if args.command == "export":
        _run_export(args, logger)
        sys.exit(0)


if __name__ == '__main__':
    main()