#!/usr/bin/python3
"""Command-line entry point for the Canalblog / WordPress tooling.

Sub-commands (see the argument parser below): ``import``, ``remove``,
``export``, ``change`` and ``menu``.  Work is fanned out over a thread pool
whose size is given by ``--parallel``.
"""
from requests.auth import HTTPBasicAuth
from getpass import getpass
from urllib.parse import urlparse
from concurrent import futures
from concurrent.futures import as_completed, wait, ALL_COMPLETED
import argparse
import logging
import threading
import os
import glob
import sys
from lib.WPImport import WPimport
from lib.WPExport import WPExport
from lib.WPRemove import WPRemove
from lib.WPChange import WPChange
from lib.WPMenu import WPMenu


def _force_scheme(raw_url, protocol):
    """Return *raw_url* with its scheme replaced by *protocol*.

    A URL given without scheme is parsed as a bare path, which makes
    ``geturl()`` emit ``scheme:///host``; the extra slash is removed.
    """
    parsed = urlparse(raw_url)._replace(scheme=protocol)
    return parsed.geturl().replace(":///", "://")


def _run_parallel(executor, logger, func, workers, *func_args):
    """Run ``func(index, *func_args)`` for each index in ``range(workers)``.

    Blocks until every task is finished and logs the exception of any task
    that failed, so that errors raised inside worker threads are not lost.
    """
    pending = [executor.submit(func, index, *func_args) for index in range(workers)]
    wait(pending, return_when=ALL_COMPLETED)
    for future in pending:
        error = future.exception()
        # A worker that called sys.exit() has already logged its own reason.
        if error is not None and not isinstance(error, SystemExit):
            logger.error("Threading error in {0} : {1!r}".format(func.__name__, error))


def errorRevert(logger, revert, tmp, parallel=None):
    """Abort a revert run when the temporary JSON files cannot be reused.

    Nothing happens unless *revert* is ``True``.  Otherwise the ``*.json``
    files in *tmp* are counted: the process exits with status 1 if there are
    none, or if their number differs from *parallel* (in which case the
    files are deleted first).

    Args:
        logger: logger used to report the reason of the abort.
        revert: value of the ``--revert`` flag.
        tmp: directory holding the temporary ``*.json`` files.
        parallel: expected number of files (number of threads).  When
            omitted, the module-level ``args.parallel`` is used, as the
            historical three-argument call did.
    """
    if revert is not True:
        return
    if parallel is None:
        # Backward compatibility with callers relying on the global
        # namespace created in the ``__main__`` section.
        parallel = args.parallel
    files_tmp = glob.glob("{0}/*.json".format(tmp))
    if not files_tmp:
        logger.error("Error revert, because files not found")
        sys.exit(1)
    if len(files_tmp) != int(parallel):
        for file_r in files_tmp:
            os.remove(file_r)
        logger.error("Error revert, because number files tmp is incompatible with parallel number")
        sys.exit(1)


def change(index, number, args, logger, tmp, revert):
    """Thread worker: run ``WPChange.fromDirectory`` on ``args.directory``.

    Args:
        index: index of this worker, forwarded as ``index_name``.
        number: total number of workers, forwarded as ``number_thread``.
        args: parsed command-line namespace (``directory`` is read).
        logger: shared logger.
        tmp: temporary directory forwarded to ``WPChange``.
        revert: value of the ``--revert`` flag.
    """
    changeWp = WPChange(logger=logger, index_name=index, number_thread=number, tmp=tmp)
    changeWp.fromDirectory(args.directory, revert)


def remove(index, number, args, basic, logger, ssl_wordpress):
    """Thread worker: clean the WordPress sites listed in ``args.wordpress``.

    When ``args.remove`` is set, posts, tags, categories and media are all
    cleaned; otherwise only the kinds whose flag (``posts``, ``categories``,
    ``tags``, ``media``) is set are cleaned.

    Args:
        index: index of this worker, forwarded as ``index_name``.
        number: total number of workers, forwarded as ``number_thread``.
        args: parsed command-line namespace.
        basic: HTTP basic-auth credentials.
        logger: shared logger.
        ssl_wordpress: forwarded to ``WPRemove``.
    """
    removeWp = WPRemove(basic=basic, wordpress="", logger=logger, ssl_wordpress=ssl_wordpress,
                        index_name=index, number_thread=number)
    for site in args.wordpress.split(","):
        removeWp.setUrl(site)
        if args.remove:
            removeWp.cleanPosts()
            removeWp.cleanTags()
            removeWp.cleanCategories()
            removeWp.cleanMedia()
            continue
        if args.posts:
            removeWp.cleanPosts()
        if args.categories:
            removeWp.cleanCategories()
        if args.tags:
            removeWp.cleanTags()
        if args.media:
            removeWp.cleanMedia()


def download(name_thread, max_thread, url, logger, parser, directory, html, img, ssl_canalblog, revert, tmp):
    """Thread worker: download the HTML pages and/or images of *url*.

    Args:
        name_thread: zero-based index of this worker.
        max_thread: total number of workers.
        url: URL handed to ``WPExport``.
        logger: shared logger.
        parser: HTML parser name forwarded to ``WPExport``.
        directory: output directory forwarded to ``WPExport``.
        html: value of ``--no-html``; HTML is downloaded when it is False.
        img: value of ``--no-img``; images are downloaded when it is False.
        ssl_canalblog: forwarded to ``WPExport``.
        revert: when true, ``getUrlPage`` is skipped.
        tmp: currently unused.
            NOTE(review): ``importUrl`` passes ``tmp`` to ``WPExport``;
            confirm whether it should be passed here as well.
    """
    exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger,
                        parser=parser, directory=directory, ssl_canalblog=ssl_canalblog)
    if not revert:
        exportWp.getUrlPage(name_thread, max_thread)
    for kind in ["article", "page"]:
        for section in ["publications", "principal"]:
            if html is False:
                exportWp.downloadHTML(section, kind)
            if img is False:
                exportWp.downloadImg(section, kind)


def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress,
              ssl_canalblog, create, update, image, revert, tmp, author):
    """Thread worker: import Canalblog URLs into WordPress sites.

    *canalblog* and *wordpress* are comma-separated lists.  When *serial* is
    False every Canalblog URL is imported into every WordPress site; when it
    is set, the two lists must have the same length and are processed
    pairwise.

    Args:
        name_thread: zero-based index of this worker.
        max_thread: total number of workers.
        canalblog: comma-separated Canalblog URLs.
        logger: shared logger.
        parser: HTML parser name.
        wordpress: comma-separated WordPress URLs.
        basic: HTTP basic-auth credentials.
        serial: pairwise mode switch (see above).
        ssl_wordpress: forwarded to ``WPimport``.
        ssl_canalblog: when False the Canalblog URLs are rewritten to http.
        create, update, image: forwarded as ``no_create`` / ``no_update`` /
            ``no_image``.
        revert: when true, ``getUrlPage`` is skipped.
        tmp: temporary directory shared by exporter and importer.
        author: forwarded to ``WPimport``.
    """
    canalblog = canalblog.split(",")
    wordpress = wordpress.split(",")
    name = "Thread-{0}".format(int(name_thread) + 1)
    protocol = "http" if ssl_canalblog is False else "https"

    def make_importer(site):
        return WPimport(name=name, basic=basic, wordpress=site, logger=logger, parser=parser,
                        ssl_wordpress=ssl_wordpress, no_create=create, no_update=update,
                        no_image=image, tmp=tmp, author=author)

    def collect_pages(url):
        exportWp = WPExport(name=name, url=url, logger=logger, parser=parser,
                            ssl_canalblog=ssl_canalblog, tmp=tmp)
        if not revert:
            exportWp.getUrlPage(name_thread, max_thread)
        # Dropped explicitly, as before, ahead of the importer's creation.
        del exportWp

    def import_all(importWp):
        for kind in ["article", "page"]:
            for section in ["publications", "principal"]:
                importWp.fromUrl(section, kind)

    if serial is False:
        for canal in canalblog:
            try:
                url = _force_scheme(canal, protocol)
            except Exception as err:
                logger.error("{0} : parsing error : {1}".format(name, err))
                sys.exit(1)
            collect_pages(url)
            for site in wordpress:
                importWp = make_importer(site)
                import_all(importWp)
                del importWp
        return

    if len(canalblog) != len(wordpress):
        logger.error("{0} : ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress".format(name))
        sys.exit(1)
    # zip() visits every pair, including the last one.
    for canal, site in zip(canalblog, wordpress):
        try:
            url = _force_scheme(canal, protocol)
        except Exception as err:
            logger.error("parsing error : {0}".format(err))
            sys.exit(1)
        collect_pages(url)
        importWp = make_importer(site)
        import_all(importWp)
        del importWp


def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial, ssl_wordpress,
                    create, update, image, revert, author):
    """Thread worker: import local directories into WordPress sites.

    *directory* and *wordpress* are comma-separated lists.  When *serial* is
    False every directory is imported into every WordPress site; when it is
    set, the two lists must have the same length and are processed pairwise.

    Args:
        name_thread: zero-based index of this worker.
        max_thread: total number of workers.
        directory: comma-separated directories.
        logger: shared logger.
        parser: HTML parser name.
        wordpress: comma-separated WordPress URLs.
        basic: HTTP basic-auth credentials.
        serial: pairwise mode switch (see above).
        ssl_wordpress: forwarded to ``WPimport``.
        create, update, image: forwarded as ``no_create`` / ``no_update`` /
            ``no_image``.
        revert: forwarded to ``WPimport.fromDirectory``.
        author: forwarded to ``WPimport``.
    """
    name = "Thread-{0}".format(int(name_thread) + 1)
    directory = directory.split(",")
    wordpress = wordpress.split(",")

    def make_importer(site):
        return WPimport(name=name, basic=basic, wordpress=site, logger=logger, parser=parser,
                        ssl_wordpress=ssl_wordpress, no_create=create, no_update=update,
                        no_image=image, author=author)

    if serial is False:
        for site in wordpress:
            importWp = make_importer(site)
            for folder in directory:
                importWp.fromDirectory(folder, name_thread, max_thread, revert)
            del importWp
        return

    if len(directory) != len(wordpress):
        logger.error("{0} : Error : Number directory is different than wordpress".format(name))
        sys.exit(1)
    # zip() visits every pair, including the last one.
    for folder, site in zip(directory, wordpress):
        importWp = make_importer(site)
        importWp.fromDirectory(folder, name_thread, max_thread, revert)
        del importWp


def _build_parser():
    """Build the command-line parser with its five sub-commands."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", help="Verbosity", action="store_true")
    parser.add_argument("--logfile", help="Log file", default="")
    parser.add_argument("--quiet", help="No console output", action="store_true")
    parser.add_argument("--parser", help="Parser content", default="html.parser")
    parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1, type=int)
    parser.add_argument("--no-ssl", help="No ssl for canalblog and/or wordpress (example wordpress,canalblog)",
                        dest="ssl", default="")
    parser.add_argument("--revert", help="Restart a work from stopping work", action="store_true")
    parser.add_argument("--tmp", help="directory tmp", default="/tmp/import_export_canablog")

    subparsers = parser.add_subparsers(dest="command")

    import_parser = subparsers.add_parser("import")
    import_parser.add_argument("--user", help="wordpress user", required=True)
    import_parser.add_argument("--password", help="password wordpress's user", default="")
    import_parser.add_argument("--file", help="HTML file", default="")
    import_parser.add_argument("--directory", help="HTML directory", default="")
    import_parser.add_argument("--canalblog", help="URL Canalblog", default="")
    import_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
    import_parser.add_argument("--serial", help="Serial execution", action="store_true")
    import_parser.add_argument("--remove-all", dest="remove", help="Remove all", action="store_true")
    import_parser.add_argument("--remove-posts", help="Remove all posts", dest="posts", action="store_true")
    import_parser.add_argument("--remove-categories", help="Remove all categories", dest="categories",
                               action="store_true")
    import_parser.add_argument("--remove-tags", help="Remove all tags", dest="tags", action="store_true")
    import_parser.add_argument("--remove-media", help="Remove all media", dest="media", action="store_true")
    # store_true already defaults to False; a string default would be truthy.
    import_parser.add_argument("--no-create", help="No create post", dest="create", action="store_true")
    import_parser.add_argument("--no-update", help="No update post", dest="update", action="store_true")
    import_parser.add_argument("--no-image", help="No image add or update", dest="image", action="store_true")
    import_parser.add_argument("--author-album", dest="author", help="Define author for page album", default="")

    remove_parser = subparsers.add_parser("remove")
    remove_parser.add_argument("--user", help="wordpress user", required=True)
    remove_parser.add_argument("--password", help="password wordpress's user", default="")
    remove_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
    remove_parser.add_argument("--all", dest="remove", help="Remove all (posts, media, tags, categories)",
                               action="store_true")
    remove_parser.add_argument("--posts", help="Remove all posts", action="store_true")
    remove_parser.add_argument("--categories", help="Remove all categories", action="store_true")
    remove_parser.add_argument("--tags", help="Remove all tags", action="store_true")
    remove_parser.add_argument("--media", help="Remove all media", action="store_true")

    export_parser = subparsers.add_parser("export")
    export_parser.add_argument("--url", help="canblog URL to be scraping", required=True)
    export_parser.add_argument("--directory", default="backup", help="backup file path")
    export_parser.add_argument("--no-css", help="No CSS", dest="css", action="store_true")
    export_parser.add_argument("--no-js", help="No JS", dest="js", action="store_true")
    export_parser.add_argument("--no-img", help="No img", dest="img", action="store_true")
    export_parser.add_argument("--no-html", help="No HTML", dest="html", action="store_true")

    change_parser = subparsers.add_parser("change")
    change_parser.add_argument("--directory", default="", help="Directory")
    change_parser.add_argument("--file", default="", help="File")

    menu_parser = subparsers.add_parser("menu")
    menu_parser.add_argument("--user", help="wordpress user", required=True)
    menu_parser.add_argument("--password", help="password wordpress's user", default="")
    menu_parser.add_argument("--file", help="HTML file", default="")
    menu_parser.add_argument("--canalblog", help="URL Canalblog", default="")
    menu_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
    return parser


if __name__ == '__main__':
    args = _build_parser().parse_args()
    workers = int(args.parallel)

    logger = logging.getLogger('import export canalblog')
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    ssl_canalblog = True
    ssl_wordpress = True
    for i in args.ssl.split(","):
        if i == "canalblog":
            ssl_canalblog = False
        if i == "wordpress":
            ssl_wordpress = False

    # The logger level is set unconditionally so that --quiet combined with
    # --logfile still records INFO/DEBUG messages in the file.
    level = logging.DEBUG if args.debug is True else logging.INFO
    logger.setLevel(level)
    if args.quiet is False:
        ch = logging.StreamHandler()
        ch.setLevel(level)
        ch.setFormatter(formatter)
        logger.addHandler(ch)
    if len(args.logfile) > 0:
        fileHandler = logging.FileHandler(args.logfile)
        fileHandler.setLevel(level)
        fileHandler.setFormatter(formatter)
        logger.addHandler(fileHandler)

    os.makedirs(args.tmp, exist_ok=True)

    if args.command in ("import", "remove", "menu"):
        password = args.password
        if len(args.password) == 0:
            password = getpass()
        if len(password) == 0:
            logger.error("No password error !!! ")
            sys.exit(1)
        basic = HTTPBasicAuth(args.user, password)

    if args.command == "import":
        wordpress = args.wordpress.split(",")
        importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser,
                            ssl_wordpress=ssl_wordpress, author=args.author)
        if len(args.file) > 0:
            for i in wordpress:
                importWp.setUrl(i)
                importWp.fromFile(files=args.file.split(","))
            menuWp = WPMenu(name="Thread-1", basic=basic, wordpress=args.wordpress, logger=logger,
                            parser=args.parser, ssl_canalblog=ssl_canalblog, ssl_wordpress=ssl_wordpress)
            menuWp.fromFile("{0}".format(args.file.split(",")[0]))
        if len(args.directory) > 0:
            try:
                with futures.ThreadPoolExecutor(max_workers=workers) as ex:
                    _run_parallel(ex, logger, remove, workers,
                                  args.parallel, args, basic, logger, ssl_wordpress)
                    errorRevert(logger, args.revert, args.tmp, args.parallel)
                    _run_parallel(ex, logger, importDirectory, workers,
                                  workers, args.directory, logger, args.parser, args.wordpress, basic,
                                  args.serial, ssl_wordpress, args.create, args.update, args.image,
                                  args.revert, args.author)
                menuWp = WPMenu(name="Thread-1", basic=basic, wordpress=args.wordpress, logger=logger,
                                parser=args.parser, ssl_canalblog=ssl_canalblog, ssl_wordpress=ssl_wordpress)
                menuWp.fromFile("{0}/index.html".format(args.directory))
            except Exception as err:
                logger.error("Threading error : {0}".format(err))
        if len(args.canalblog) > 0:
            try:
                with futures.ThreadPoolExecutor(max_workers=workers) as ex:
                    _run_parallel(ex, logger, remove, workers,
                                  args.parallel, args, basic, logger, ssl_wordpress)
                    errorRevert(logger, args.revert, args.tmp, args.parallel)
                    _run_parallel(ex, logger, importUrl, workers,
                                  workers, args.canalblog, logger, args.parser, args.wordpress, basic,
                                  args.serial, ssl_wordpress, ssl_canalblog, args.create, args.update,
                                  args.image, args.revert, args.tmp, args.author)
                menuWp = WPMenu(name="Thread-1", basic=basic, wordpress=args.wordpress, logger=logger,
                                parser=args.parser, ssl_canalblog=ssl_canalblog, ssl_wordpress=ssl_wordpress)
                menuWp.fromUrl(args.canalblog)
            except Exception as err:
                logger.error("Threading error : {0}".format(err))
        sys.exit(0)

    if args.command == "export":
        protocol = "http" if ssl_canalblog is False else "https"
        for canal in args.url.split(","):
            try:
                url = _force_scheme(canal, protocol)
            except Exception as err:
                logger.error("parsing error : {0}".format(err))
                sys.exit(1)
            # One exporter per URL: it is discarded before the threaded
            # download starts, so it cannot be shared between iterations.
            exportWp = WPExport(logger=logger, parser=args.parser, directory=args.directory,
                                ssl_canalblog=ssl_canalblog)
            exportWp.setUrl(url)
            if args.js is False:
                exportWp.downloadJs()
            if args.css is False:
                exportWp.downloadCss()
            del exportWp

            if args.html is False or args.img is False:
                try:
                    with futures.ThreadPoolExecutor(max_workers=workers) as ex:
                        _run_parallel(ex, logger, download, workers,
                                      workers, url, logger, args.parser, args.directory, args.html,
                                      args.img, ssl_canalblog, args.revert, args.tmp)
                except Exception as err:
                    logger.error("Threading error : {0}".format(err))
        sys.exit(0)

    if args.command == "remove":
        try:
            with futures.ThreadPoolExecutor(max_workers=workers) as ex:
                _run_parallel(ex, logger, remove, workers,
                              args.parallel, args, basic, logger, ssl_wordpress)
        except Exception as err:
            logger.error("Thread error for remove : {0}".format(err))
        sys.exit(0)

    if args.command == "change":
        if len(args.directory) > 0:
            try:
                with futures.ThreadPoolExecutor(max_workers=workers) as ex:
                    errorRevert(logger, args.revert, args.tmp, args.parallel)
                    _run_parallel(ex, logger, change, workers,
                                  args.parallel, args, logger, args.tmp, args.revert)
            except Exception as err:
                logger.error("Thread error for change : {0}".format(err))
        if len(args.file) > 0:
            changeWp = WPChange(logger=logger)
            for filei in args.file.split(","):
                changeWp.fromFile(filei)
        sys.exit(0)

    if args.command == "menu":
        menuWp = WPMenu(name="Thread-1", basic=basic, wordpress=args.wordpress, logger=logger,
                        parser=args.parser, ssl_canalblog=ssl_canalblog, ssl_wordpress=ssl_wordpress)
        if len(args.file) > 0:
            menuWp.fromFile(args.file)
        if len(args.canalblog) > 0:
            menuWp.fromUrl(args.canalblog)
        sys.exit(0)