#!/usr/bin/python3
"""Command-line front end for the WPExport / WPimport / WPRemove / WPChange helpers.

Four sub-commands are exposed: ``import``, ``export``, ``remove`` and
``change``.  The heavy lifting is delegated to the classes in ``lib``; this
module only parses arguments, configures logging and fans the work out over a
thread pool whose size is given by ``--parallel``.
"""
import argparse
import glob
import logging
import os
import sys
import threading
from concurrent import futures
from concurrent.futures import as_completed, wait, ALL_COMPLETED
from getpass import getpass
from urllib.parse import urlparse

from requests.auth import HTTPBasicAuth

from lib.WPImport import WPimport
from lib.WPExport import WPExport
from lib.WPRemove import WPRemove
from lib.WPChange import WPChange

# Iteration order used by every import/export loop: the outer loop walks the
# content kinds, the inner loop walks the sections.
_CONTENT_KINDS = ("article", "page")
_SECTIONS = ("publications", "principal")


def _normalize_url(raw_url, protocol):
    """Return *raw_url* with its scheme forced to *protocol*.

    A scheme-less input such as ``example.com`` is parsed by ``urlparse`` as a
    bare path, so ``geturl()`` yields ``https:///example.com``; the triple
    slash is collapsed so the result is a usable absolute URL.
    """
    parsed = urlparse(raw_url)._replace(scheme=protocol)
    return parsed.geturl().replace(":///", "://")


def _run_in_threads(logger, workers, error_prefix, func, *func_args):
    """Run ``func(index, *func_args)`` once per worker index and wait for all.

    Args:
        logger: logger used to report failures.
        workers: number of threads, also the number of calls (indices
            ``0 .. workers - 1``).
        error_prefix: text placed at the start of every error message.
        func: callable executed in each thread.
        *func_args: extra positional arguments forwarded after the index.

    Returns:
        The number of failures: one per worker that raised (including
        ``SystemExit`` raised by ``sys.exit`` inside the worker), plus one if
        the pool itself could not be set up.
    """
    failures = 0
    try:
        with futures.ThreadPoolExecutor(max_workers=workers) as executor:
            pending = [executor.submit(func, index, *func_args) for index in range(workers)]
            wait(pending, return_when=ALL_COMPLETED)
            # Without this inspection an exception raised inside a worker is
            # stored on its future and never reported.
            for index, future in enumerate(pending):
                err = future.exception()
                if err is not None:
                    failures += 1
                    logger.error("{0} : worker {1} : {2!r}".format(error_prefix, index, err))
    except Exception as err:
        failures += 1
        logger.error("{0} : {1}".format(error_prefix, err))
    return failures


def change(index, number, args, logger):
    """Worker: run ``WPChange.fromDirectory`` on ``args.directory``.

    Args:
        index: index of this worker, passed to WPChange as ``index_name``.
        number: total number of workers, passed as ``number_thread``.
        args: parsed command-line namespace (``directory`` is read).
        logger: logger handed to WPChange.
    """
    changeWp = WPChange(logger=logger, index_name=index, number_thread=number)
    changeWp.fromDirectory(args.directory)
    del changeWp


def remove(index, number, args, basic, logger, ssl_wordpress):
    """Worker: clean the selected content on every URL in ``args.wordpress``.

    When ``args.remove`` is true, posts, tags, categories and media are all
    cleaned; otherwise only the kinds whose flag (``posts``, ``categories``,
    ``tags``, ``media``) is true are cleaned.

    Args:
        index: index of this worker, passed to WPRemove as ``index_name``.
        number: total number of workers, passed as ``number_thread``.
        args: parsed command-line namespace.
        basic: authentication object handed to WPRemove.
        logger: logger handed to WPRemove.
        ssl_wordpress: SSL flag handed to WPRemove.
    """
    removeWp = WPRemove(basic=basic, wordpress="", logger=logger, ssl_wordpress=ssl_wordpress,
                        index_name=index, number_thread=number)
    for target in args.wordpress.split(","):
        removeWp.setUrl(target)
        if args.remove == True:
            removeWp.cleanPosts()
            removeWp.cleanTags()
            removeWp.cleanCategories()
            removeWp.cleanMedia()
            continue
        if args.posts == True:
            removeWp.cleanPosts()
        if args.categories == True:
            removeWp.cleanCategories()
        if args.tags == True:
            removeWp.cleanTags()
        if args.media == True:
            removeWp.cleanMedia()
    del removeWp


def download(name_thread, max_thread, url, logger, parser, directory, html, img, ssl_canalblog, revert, tmp):
    """Worker: download HTML and/or images of *url* through WPExport.

    Args:
        name_thread: zero-based index of this worker.
        max_thread: total number of workers.
        url: URL handed to WPExport.
        logger: logger handed to WPExport.
        parser: parser name handed to WPExport.
        directory: output directory handed to WPExport.
        html: HTML is downloaded only when this is ``False`` (``--no-html`` unset).
        img: images are downloaded only when this is ``False`` (``--no-img`` unset).
        ssl_canalblog: SSL flag handed to WPExport.
        revert: when true, ``getUrlPage`` is skipped.
        tmp: temporary directory handed to WPExport.
    """
    exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger,
                        parser=parser, directory=directory, ssl_canalblog=ssl_canalblog, tmp=tmp)
    if not revert:
        exportWp.getUrlPage(name_thread, max_thread)
    for kind in _CONTENT_KINDS:
        for section in _SECTIONS:
            if html is False:
                exportWp.downloadHTML(section, kind)
            if img is False:
                exportWp.downloadImg(section, kind)
    del exportWp


def _import_sections(importWp):
    """Call ``importWp.fromUrl(section, kind)`` for every kind/section pair."""
    for kind in _CONTENT_KINDS:
        for section in _SECTIONS:
            importWp.fromUrl(section, kind)


def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress,
              ssl_canalblog, create, update, image, revert, tmp):
    """Worker: scan source URLs with WPExport, then import them with WPimport.

    With ``serial`` false every source URL is imported into every target URL.
    Otherwise sources and targets are paired one-to-one and the two lists
    must have the same length.

    Args:
        name_thread: zero-based index of this worker.
        max_thread: total number of workers.
        canalblog: comma-separated source URLs.
        logger: logger handed to the helpers.
        parser: parser name handed to the helpers.
        wordpress: comma-separated target URLs.
        basic: authentication object handed to WPimport.
        serial: selects one-to-one pairing when not ``False``.
        ssl_wordpress: SSL flag handed to WPimport.
        ssl_canalblog: SSL flag handed to WPExport; ``False`` selects ``http``.
        create: forwarded to WPimport as ``no_create``.
        update: forwarded to WPimport as ``no_update``.
        image: forwarded to WPimport as ``no_image``.
        revert: when true, ``getUrlPage`` is skipped.
        tmp: temporary directory handed to the helpers.

    Raises:
        SystemExit: on a URL parsing error or a source/target count mismatch.
    """
    canalblog = canalblog.split(",")
    wordpress = wordpress.split(",")
    name = "Thread-{0}".format(int(name_thread) + 1)
    protocol = "http" if ssl_canalblog is False else "https"

    if serial is False:
        for canal in canalblog:
            try:
                url = _normalize_url(canal, protocol)
            except Exception as err:
                logger.error("{0} : parsing error : {1}".format(name, err))
                sys.exit(1)
            exportWp = WPExport(name=name, url=url, logger=logger, parser=parser,
                                ssl_canalblog=ssl_canalblog, tmp=tmp)
            if not revert:
                exportWp.getUrlPage(name_thread, max_thread)
            del exportWp
            for target in wordpress:
                importWp = WPimport(name=name, basic=basic, wordpress=target, logger=logger, parser=parser,
                                    ssl_wordpress=ssl_wordpress, no_create=create, no_update=update,
                                    no_image=image, tmp=tmp)
                _import_sections(importWp)
                del importWp
        return

    if len(canalblog) != len(wordpress):
        logger.error("{0} : ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress".format(name))
        sys.exit(1)
    # zip() visits every pair; the previous range(0, len - 1) dropped the last one.
    for canal, target in zip(canalblog, wordpress):
        try:
            url = _normalize_url(canal, protocol)
        except Exception as err:
            logger.error("parsing error : {0}".format(err))
            sys.exit(1)
        exportWp = WPExport(name=name, url=url, logger=logger, parser=parser,
                            ssl_canalblog=ssl_canalblog, tmp=tmp)
        if not revert:
            exportWp.getUrlPage(name_thread, max_thread)
        del exportWp
        importWp = WPimport(name=name, basic=basic, wordpress=target, logger=logger, parser=parser,
                            ssl_wordpress=ssl_wordpress, no_create=create, no_update=update,
                            no_image=image, tmp=tmp)
        _import_sections(importWp)
        del importWp


def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial, ssl_wordpress,
                    create, update, image):
    """Worker: import local directories with ``WPimport.fromDirectory``.

    With ``serial`` false every directory is imported into every target URL.
    Otherwise directories and targets are paired one-to-one and the two lists
    must have the same length.

    Args:
        name_thread: zero-based index of this worker.
        max_thread: total number of workers.
        directory: comma-separated directories.
        logger: logger handed to WPimport.
        parser: parser name handed to WPimport.
        wordpress: comma-separated target URLs.
        basic: authentication object handed to WPimport.
        serial: selects one-to-one pairing when not ``False``.
        ssl_wordpress: SSL flag handed to WPimport.
        create: forwarded to WPimport as ``no_create``.
        update: forwarded to WPimport as ``no_update``.
        image: forwarded to WPimport as ``no_image``.

    Raises:
        SystemExit: when the directory and target counts differ in serial mode.
    """
    name = "Thread-{0}".format(int(name_thread) + 1)
    directory = directory.split(",")
    wordpress = wordpress.split(",")

    if serial is False:
        for target in wordpress:
            importWp = WPimport(name=name, basic=basic, wordpress=target, logger=logger, parser=parser,
                                ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image)
            for folder in directory:
                importWp.fromDirectory(folder, name_thread, max_thread)
            del importWp
        return

    if len(directory) != len(wordpress):
        logger.error("{0} : Error : Number directory is different than wordpress".format(name))
        sys.exit(1)
    # zip() visits every pair; the previous range(0, len - 1) dropped the last one.
    for target, folder in zip(wordpress, directory):
        importWp = WPimport(name=name, basic=basic, wordpress=target, logger=logger, parser=parser,
                            ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image)
        # Same call shape as the non-serial branch: thread index and count are forwarded.
        importWp.fromDirectory(folder, name_thread, max_thread)
        del importWp


def _build_parser():
    """Build the argument parser with its four sub-commands."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", help="Verbosity", action="store_true")
    parser.add_argument("--logfile", help="Log file", default="")
    parser.add_argument("--quiet", help="No console output", action="store_true")
    parser.add_argument("--parser", help="Parser content", default="html.parser")
    # type=int so that every consumer receives an integer, whether or not the
    # option was given on the command line.
    parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1, type=int)
    parser.add_argument("--no-ssl", help="No ssl for canalblog and/or wordpress (example wordpress,canalblog)",
                        dest="ssl", default="")
    parser.add_argument("--revert", help="Restart a work from stopping work", action="store_true")
    parser.add_argument("--tmp", help="directory tmp", default="/tmp/import_export_canablog")

    subparsers = parser.add_subparsers(dest="command")

    import_parser = subparsers.add_parser("import")
    import_parser.add_argument("--user", help="wordpress user", required=True)
    import_parser.add_argument("--password", help="password wordpress's user", default="")
    import_parser.add_argument("--file", help="HTML file", default="")
    import_parser.add_argument("--directory", help="HTML directory", default="")
    import_parser.add_argument("--canalblog", help="URL Canalblog", default="")
    import_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
    import_parser.add_argument("--serial", help="Serial execution", action="store_true")
    import_parser.add_argument("--remove-all", dest="remove", help="Remove all", action="store_true")
    import_parser.add_argument("--remove-posts", help="Remove all posts", dest="posts", action="store_true")
    import_parser.add_argument("--remove-categories", help="Remove all categories", dest="categories",
                               action="store_true")
    import_parser.add_argument("--remove-tags", help="Remove all tags", dest="tags", action="store_true")
    import_parser.add_argument("--remove-media", help="Remove all media", dest="media", action="store_true")
    # No explicit default here: store_true already defaults to False.  The
    # former default="store_false" was a truthy string, so the flag could
    # never be read as False.
    import_parser.add_argument("--no-create", help="No create post", dest="create", action="store_true")
    import_parser.add_argument("--no-update", help="No update post", dest="update", action="store_true")
    import_parser.add_argument("--no-image", help="No image add or update", dest="image", action="store_true")

    remove_parser = subparsers.add_parser("remove")
    remove_parser.add_argument("--user", help="wordpress user", required=True)
    remove_parser.add_argument("--password", help="password wordpress's user", default="")
    remove_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
    remove_parser.add_argument("--all", dest="remove", help="Remove all (posts, media, tags, categories)",
                               action="store_true")
    remove_parser.add_argument("--posts", help="Remove all posts", action="store_true")
    remove_parser.add_argument("--categories", help="Remove all categories", action="store_true")
    remove_parser.add_argument("--tags", help="Remove all tags", action="store_true")
    remove_parser.add_argument("--media", help="Remove all media", action="store_true")

    export_parser = subparsers.add_parser("export")
    export_parser.add_argument("--url", help="canblog URL to be scraping", required=True)
    export_parser.add_argument("--directory", default="backup", help="backup file path")
    export_parser.add_argument("--no-css", help="No CSS", dest="css", action="store_true")
    export_parser.add_argument("--no-js", help="No JS", dest="js", action="store_true")
    export_parser.add_argument("--no-img", help="No img", dest="img", action="store_true")
    export_parser.add_argument("--no-html", help="No HTML", dest="html", action="store_true")

    change_parser = subparsers.add_parser("change")
    change_parser.add_argument("--directory", default="", help="Directory")
    change_parser.add_argument("--file", default="", help="File")

    return parser


def _configure_logging(args):
    """Create the application logger with console and/or file handlers."""
    logger = logging.getLogger('import export canalblog')
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    level = logging.DEBUG if args.debug is True else logging.INFO
    # Set unconditionally: when this was done only in the console branch,
    # --quiet combined with --logfile lost every INFO/DEBUG record.
    logger.setLevel(level)

    if args.quiet is False:
        console = logging.StreamHandler()
        console.setLevel(level)
        console.setFormatter(formatter)
        logger.addHandler(console)

    if len(args.logfile) > 0:
        file_handler = logging.FileHandler(args.logfile)
        file_handler.setLevel(level)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger


def main():
    """Parse the command line and run the requested sub-command.

    Exits with status 1 when a fatal error occurs or when at least one worker
    thread failed, and with status 0 otherwise.
    """
    parser = _build_parser()
    args = parser.parse_args()
    if args.parallel < 1:
        parser.error("--parallel must be at least 1")
    workers = args.parallel

    logger = _configure_logging(args)

    disabled_ssl = args.ssl.split(",")
    ssl_canalblog = "canalblog" not in disabled_ssl
    ssl_wordpress = "wordpress" not in disabled_ssl

    os.makedirs(args.tmp, exist_ok=True)

    failures = 0
    basic = None
    if args.command == "import" or args.command == "remove":
        password = args.password
        if len(password) == 0:
            password = getpass()
        if len(password) == 0:
            logger.error("No password error !!! ")
            sys.exit(1)
        basic = HTTPBasicAuth(args.user, password)

    if args.command == "import":
        wordpress = args.wordpress.split(",")
        importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser,
                            ssl_wordpress=ssl_wordpress)
        if len(args.file) > 0:
            for target in wordpress:
                importWp.setUrl(target)
                importWp.fromFile(files=args.file.split(","))

        if len(args.directory) > 0:
            # The removal pass must be finished before the import pass starts.
            failures += _run_in_threads(logger, workers, "Threading error", remove,
                                        workers, args, basic, logger, ssl_wordpress)
            failures += _run_in_threads(logger, workers, "Threading error", importDirectory,
                                        workers, args.directory, logger, args.parser, args.wordpress, basic,
                                        args.serial, ssl_wordpress, args.create, args.update, args.image)

        if len(args.canalblog) > 0:
            failures += _run_in_threads(logger, workers, "Threading error", remove,
                                        workers, args, basic, logger, ssl_wordpress)
            if args.revert is True:
                files_tmp = glob.glob("{0}/*.json".format(args.tmp))
                if len(files_tmp) == 0:
                    logger.error("Error revert, because files not found")
                    sys.exit(1)
                if len(files_tmp) != workers:
                    # NOTE(review): the saved files are discarded when their
                    # count does not match --parallel, yet revert stays
                    # enabled so the workers will not call getUrlPage again
                    # - confirm this is the intended behaviour.
                    for stale in files_tmp:
                        os.remove(stale)
            failures += _run_in_threads(logger, workers, "Threading error", importUrl,
                                        workers, args.canalblog, logger, args.parser, args.wordpress, basic,
                                        args.serial, ssl_wordpress, ssl_canalblog, args.create, args.update,
                                        args.image, args.revert, args.tmp)

    elif args.command == "export":
        protocol = "http" if ssl_canalblog is False else "https"
        exportWp = WPExport(logger=logger, parser=args.parser, directory=args.directory,
                            ssl_canalblog=ssl_canalblog)
        for canal in args.url.split(","):
            try:
                url = _normalize_url(canal, protocol)
            except Exception as err:
                logger.error("parsing error : {0}".format(err))
                sys.exit(1)
            exportWp.setUrl(url)
            if args.js is False:
                exportWp.downloadJs()
            if args.css is False:
                exportWp.downloadCss()
            if args.html is False or args.img is False:
                failures += _run_in_threads(logger, workers, "Threading error", download,
                                            workers, url, logger, args.parser, args.directory, args.html,
                                            args.img, ssl_canalblog, args.revert, args.tmp)
        del exportWp

    elif args.command == "remove":
        failures += _run_in_threads(logger, workers, "Thread error for remove", remove,
                                    workers, args, basic, logger, ssl_wordpress)

    elif args.command == "change":
        if len(args.directory) > 0:
            failures += _run_in_threads(logger, workers, "Thread error for change", change,
                                        workers, args, logger)
        if len(args.file) > 0:
            changeWp = WPChange(logger=logger)
            for filei in args.file.split(","):
                changeWp.fromFile(filei)

    sys.exit(1 if failures else 0)


if __name__ == '__main__':
    main()