web_scrap/import_export_canalblog.py

359 lines
17 KiB
Python

#!/usr/bin/python3
from requests.auth import HTTPBasicAuth
from getpass import getpass
from urllib.parse import urlparse
from concurrent import futures
from concurrent.futures import as_completed, wait, ALL_COMPLETED
import argparse, logging, threading, os, glob
from lib.WPImport import WPimport
from lib.WPExport import WPExport
from lib.WPRemove import WPRemove
from lib.WPChange import WPChange
from lib.WPMenu import WPMenu
def errorRevert(logger, revert, tmp):
if revert is True:
files_tmp = glob.glob("{0}/*.json".format(tmp))
if len(files_tmp) == 0:
logger.error("Error revert, because files not found")
exit(1)
if len(files_tmp) != int(args.parallel):
for file_r in files_tmp:
os.remove(file_r)
logger.error("Error revert, because number files tmp is incompatible with parallel number")
exit(1)
def change(index, number, args, logger, tmp, revert):
changeWp = WPChange(logger=logger, index_name=index, number_thread=number, tmp=tmp)
changeWp.fromDirectory(args.directory, revert)
del changeWp
def remove(index, number, args, basic, logger, ssl_wordpress):
removeWp = WPRemove(basic=basic, wordpress="", logger=logger, ssl_wordpress=ssl_wordpress, index_name=index, number_thread=number)
if args.remove == True:
for i in args.wordpress.split(","):
removeWp.setUrl(i)
removeWp.cleanPosts()
removeWp.cleanTags()
removeWp.cleanCategories()
removeWp.cleanMedia()
else:
for i in args.wordpress.split(","):
removeWp.setUrl(i)
if args.posts == True:
removeWp.cleanPosts()
if args.categories == True:
removeWp.cleanCategories()
if args.tags == True:
removeWp.cleanTags()
if args.media == True:
removeWp.cleanMedia()
del removeWp
def download(name_thread, max_thread, url, logger, parser, directory, html, img, ssl_canalblog, revert, tmp):
exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory, ssl_canalblog=ssl_canalblog)
if not revert:
exportWp.getUrlPage(name_thread, max_thread)
for i in ["article", "page"]:
for j in ["publications", "principal"]:
if html is False:
exportWp.downloadHTML(j, i)
if img is False:
exportWp.downloadImg(j, i)
del exportWp
def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress, ssl_canalblog, create, update, image, revert, tmp):
canalblog = canalblog.split(",")
wordpress = wordpress.split(",")
name = "Thread-{0}".format(int(name_thread) + 1)
protocol = "https"
if ssl_canalblog is False:
protocol = "http"
if serial is False:
for canal in canalblog:
try:
o = urlparse(canal)
o = o._replace(scheme=protocol)
url = o.geturl().replace(":///", "://")
except Exception as err:
logger.error("{0} : parsing error : {1}".format(name, err))
exit(1)
exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, ssl_canalblog=ssl_canalblog, tmp=tmp)
if not revert:
exportWp.getUrlPage(name_thread, max_thread)
del exportWp
for j in wordpress:
importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp)
for k in ["article", "page"]:
for l in ["publications", "principal"]:
importWp.fromUrl(l, k)
del importWp
else:
if len(canalblog) != len(wordpress):
logger.error("{0} : ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress".format(name))
exit(1)
for i in range(0, len(canalblog)-1):
try:
o = urlparse(canalblog[i])
o = o._replace(scheme=protocol)
url = o.geturl().replace(":///", "://")
except Exception as err:
logger.error("parsing error : {0}".format(err))
exit(1)
exportWp = WPExport(name=name, url=url, logger=logger, parser=parser, ssl_canalblog=ssl_canalblog)
if not revert:
exportWp.getUrlPage(name_thread, max_thread)
del exportWp
importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp)
for k in ["article", "page"]:
for l in ["publications", "principal"]:
importWp.fromUrl(webpage[l][k])
del importWp
def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial, ssl_wordpress, create, update, image, revert):
name = "Thread-{0}".format(int(name_thread) + 1)
directory = directory.split(",")
wordpress = wordpress.split(",")
if serial is False:
for i in wordpress:
importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image)
for j in directory:
importWp.fromDirectory(j, name_thread, max_thread, revert)
del importWp
else:
if len(directory) != len(wordpress):
logger.error("{0} : Error : Number directory is different than wordpress".format(name))
exit(1)
for i in range(0, len(wordpress)-1):
importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image)
importWp.fromDirectory(directory[i], name_thread, max_thread, revert)
del importWp
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--debug", help="Verbosity", action="store_true")
parser.add_argument("--logfile", help="Log file", default="")
parser.add_argument("--quiet", help="No console output", action="store_true")
parser.add_argument("--parser", help="Parser content", default="html.parser")
parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1)
parser.add_argument("--no-ssl", help="No ssl for canalblog and/or wordpress (example wordpress,canalblog)", dest="ssl", default="")
parser.add_argument("--revert", help="Restart a work from stopping work", action="store_true")
parser.add_argument("--tmp", help="directory tmp", default="/tmp/import_export_canablog")
subparsers = parser.add_subparsers(dest="command")
import_parser = subparsers.add_parser("import")
import_parser.add_argument("--user", help="wordpress user", required=True)
import_parser.add_argument("--password", help="password wordpress's user", default="")
import_parser.add_argument("--file", help="HTML file", default="")
import_parser.add_argument("--directory", help="HTML directory", default="")
import_parser.add_argument("--canalblog", help="URL Canalblog", default="")
import_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
import_parser.add_argument("--serial", help="Serial execution", action="store_true")
import_parser.add_argument("--remove-all", dest="remove", help="Remove all", action="store_true")
import_parser.add_argument("--remove-posts", help="Remove all posts", dest="posts", action="store_true")
import_parser.add_argument("--remove-categories", help="Remove all categories", dest="categories", action="store_true")
import_parser.add_argument("--remove-tags", help="Remove all tags", dest="tags", action="store_true")
import_parser.add_argument("--remove-media", help="Remove all media", dest="media", action="store_true")
import_parser.add_argument("--no-create", help="No create post", dest="create", default="store_false", action="store_true")
import_parser.add_argument("--no-update", help="No update post", dest="update", default="store_false", action="store_true")
import_parser.add_argument("--no-image", help="No image add or update", dest="image", default="store_false", action="store_true")
remove_parser = subparsers.add_parser("remove")
remove_parser.add_argument("--user", help="wordpress user", required=True)
remove_parser.add_argument("--password", help="password wordpress's user", default="")
remove_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
remove_parser.add_argument("--all", dest="remove", help="Remove all (posts, media, tags, categories)", action="store_true")
remove_parser.add_argument("--posts", help="Remove all posts", action="store_true")
remove_parser.add_argument("--categories", help="Remove all categories", action="store_true")
remove_parser.add_argument("--tags", help="Remove all tags", action="store_true")
remove_parser.add_argument("--media", help="Remove all media", action="store_true")
export_parser = subparsers.add_parser("export")
export_parser.add_argument("--url", help="canblog URL to be scraping", required=True)
export_parser.add_argument("--directory",
default="backup",
help="backup file path")
export_parser.add_argument("--no-css", help="No CSS", dest="css", action="store_true")
export_parser.add_argument("--no-js", help="No JS", dest="js", action="store_true")
export_parser.add_argument("--no-img", help="No img", dest="img", action="store_true")
export_parser.add_argument("--no-html", help="No HTML", dest="html", action="store_true")
change_parser = subparsers.add_parser("change")
change_parser.add_argument("--directory",
default="",
help="Directory")
change_parser.add_argument("--file",
default="",
help="File")
menu_parser = subparsers.add_parser("menu")
menu_parser.add_argument("--user", help="wordpress user", required=True)
menu_parser.add_argument("--password", help="password wordpress's user", default="")
menu_parser.add_argument("--file", help="HTML file", default="")
menu_parser.add_argument("--canalblog", help="URL Canalblog", default="")
menu_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
args = parser.parse_args()
logger = logging.getLogger('import export canalblog')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ssl_canalblog = True
ssl_wordpress = True
for i in args.ssl.split(","):
if i == "canalblog":
ssl_canalblog = False
if i == "wordpress":
ssl_wordpress = False
if args.quiet is False:
ch = logging.StreamHandler()
if args.debug is True:
logger.setLevel(logging.DEBUG)
ch.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.INFO)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)
if len(args.logfile) > 0:
fileHandler = logging.FileHandler(args.logfile)
if args.debug is True:
fileHandler.setLevel(logging.DEBUG)
else:
fileHandler.setLevel(logging.INFO)
fileHandler.setFormatter(formatter)
logger.addHandler(fileHandler)
os.makedirs(args.tmp, exist_ok=True)
if args.command == "import" or args.command == "remove" or args.command = "menu":
password = args.password
if len(args.password) == 0:
password = getpass()
if len(password) == 0:
logger.error("No password error !!! ")
exit(1)
basic = HTTPBasicAuth(args.user, password)
if args.command == "import":
wordpress = args.wordpress.split(",")
importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser, ssl_wordpress=ssl_wordpress)
if len(args.file) > 0:
for i in wordpress:
importWp.setUrl(i)
importWp.fromFile(files=args.file.split(","))
if len(args.directory) > 0:
try:
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ]
wait(wait_for, return_when=ALL_COMPLETED)
errorRevert(logger, args.revert, args.tmp)
wait_for = [
ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, args.create, args.update, args.image, args.revert)
for i in range(0, int(args.parallel))
]
except Exception as err:
logger.error("Threading error : {0}".format(err))
if len(args.canalblog) > 0:
try:
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ]
wait(wait_for, return_when=ALL_COMPLETED)
errorRevert(logger, args.revert, args.tmp)
wait_for = [
ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, ssl_canalblog, args.create, args.update, args.image, args.revert, args.tmp)
for i in range(0, int(args.parallel))
]
except Exception as err:
logger.error("Threading error : {0}".format(err))
exit(0)
if args.command == "export":
canalblog = args.url.split(",")
protocol = "https"
if ssl_canalblog is False:
protocol = "http"
exportWp = WPExport(logger=logger, parser=args.parser, directory=args.directory, ssl_canalblog=ssl_canalblog)
for canal in canalblog:
try:
o = urlparse(canal)
o = o._replace(scheme=protocol)
url = o.geturl().replace(":///", "://")
except Exception as err:
logger.error("parsing error : {0}".format(err))
exit(1)
exportWp.setUrl(url)
if args.js is False:
exportWp.downloadJs()
if args.css is False:
exportWp.downloadCss()
del exportWp
if args.html is False or args.img is False:
try:
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [
ex.submit(download, i, int(args.parallel), url, logger, args.parser, args.directory, args.html, args.img, ssl_canalblog, args.revert, args.tmp)
for i in range(0, int(args.parallel))
]
except Exception as err:
logger.error("Threading error : {0}".format(err))
exit(0)
if args.command == "remove":
try:
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ]
except Exception as err:
logger.error("Thread error for remove : {0}".format(err))
exit(0)
if args.command == "change":
if len(args.directory) > 0:
try:
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
errorRevert(logger, args.revert, args.tmp)
wait_for = [ ex.submit(change, i, args.parallel, args, logger, args.tmp, args.revert) for i in range(0, int(args.parallel)) ]
except Exception as err:
logger.error("Thread error for remove : {0}".format(err))
if len(args.file) > 0:
changeWp = WPChange(logger=logger)
for filei in args.file.split(","):
changeWp.fromFile(filei)
exit(0)
if args.command == "menu":
menuWp = WPMenu(name="Thread-1", basic=basic, wordpress=args.wordpress, logger=logger, parser=args.parser, ssl_canalblog=ssl_canalblog, ssl_wordpress=ssl_wordpress)
if len(args.file) > 0:
menuWp.fromFile(args.file)
if len(args.canalblog) > 0:
menuWp.fromUrl(args.canalblog)
exit(0)