2023-03-23 23:28:57 +01:00
|
|
|
#!/usr/bin/python3
|
|
|
|
from requests.auth import HTTPBasicAuth
|
|
|
|
from getpass import getpass
|
2023-04-09 23:49:10 +02:00
|
|
|
from urllib.parse import urlparse
|
2023-04-25 00:34:25 +02:00
|
|
|
from concurrent import futures
|
|
|
|
|
2023-04-22 00:07:54 +02:00
|
|
|
import argparse, logging, threading
|
2023-04-11 22:15:36 +02:00
|
|
|
from lib.WPImport import WPimport
|
|
|
|
from lib.WPExport import WPExport
|
2023-03-28 22:29:55 +02:00
|
|
|
|
|
|
|
|
2023-04-25 16:15:45 +02:00
|
|
|
def download(name_thread, max_thread, url, logger, parser, directory, html, img):
    """Export one worker's share of a Canalblog site to disk.

    Args:
        name_thread: Zero-based index of this worker; the exporter is named
            ``Thread-<index + 1>``.
        max_thread: Total number of workers; forwarded to
            ``WPExport.getUrlPage`` together with ``name_thread``
            (presumably so each worker receives a distinct slice of the
            pages - confirm against WPExport).
        url: Normalised blog URL to export.
        logger: ``logging.Logger`` handed to the exporter.
        parser: Parser name handed to the exporter (e.g. ``"html.parser"``).
        directory: Backup directory handed to the exporter.
        html: Value of the ``--no-html`` flag; HTML is downloaded only when
            it is ``False``.
        img: Value of the ``--no-img`` flag; images are downloaded only when
            it is ``False``.
    """
    exportWp = WPExport(
        name="Thread-{0}".format(int(name_thread) + 1),
        url=url,
        logger=logger,
        parser=parser,
        directory=directory,
    )
    webpage = exportWp.getUrlPage(name_thread, max_thread)

    if html is False:
        exportWp.downloadHTML(webpage)

    # Honour the ``img`` argument. The previous code read the module-level
    # ``args.img`` instead, which ignored the parameter and raised NameError
    # whenever this function was used outside the ``__main__`` script.
    if img is False:
        exportWp.downloadImg(webpage)

    # Drop the exporter explicitly, as the rest of this file does
    # (presumably so WPExport can release its resources promptly - confirm).
    del exportWp
|
2023-04-22 00:07:54 +02:00
|
|
|
|
2023-04-27 00:00:53 +02:00
|
|
|
def _canalblog_https_url(address):
    """Return *address* with its scheme forced to ``https``."""
    parts = urlparse(address)._replace(scheme="https")
    # A scheme-less address is parsed as a bare path, in which case geturl()
    # can produce "https:///host"; collapse the extra slash.
    return parts.geturl().replace(":///", "://")


def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial):
    """Import this worker's share of Canalblog pages into WordPress.

    Args:
        name_thread: Zero-based index of this worker; used to build the
            ``Thread-<index + 1>`` name passed to the exporter/importer.
        max_thread: Total number of workers; forwarded to
            ``WPExport.getUrlPage`` together with ``name_thread``.
        canalblog: Comma-separated Canalblog URLs.
        logger: ``logging.Logger`` used for error reporting and handed to
            the exporter/importer.
        parser: Parser name handed to the exporter/importer.
        wordpress: Comma-separated WordPress URLs.
        basic: Authentication object handed to ``WPimport``.
        serial: When ``False`` every blog is imported into every WordPress
            site. Otherwise blogs and sites are paired by position and both
            lists must have the same length.

    Raises:
        SystemExit: With code 1 when a URL cannot be parsed or, in serial
            mode, when the two lists differ in length.
    """
    canalblog = canalblog.split(",")
    wordpress = wordpress.split(",")
    name = "Thread-{0}".format(int(name_thread) + 1)

    if serial is False:
        for canal in canalblog:
            try:
                url = _canalblog_https_url(canal)
            except Exception as err:
                logger.error("{0} : parsing error : {1}".format(name, err))
                exit(1)
            exportWp = WPExport(name=name, url=url, logger=logger, parser=parser)
            webpage = exportWp.getUrlPage(name_thread, max_thread)
            del exportWp
            # Push the same set of pages to every WordPress target.
            for j in wordpress:
                importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser)
                importWp.fromUrl(webpage)
                del importWp
        return

    if len(canalblog) != len(wordpress):
        logger.error("{0} : ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress".format(name))
        exit(1)

    # Pair each blog with the WordPress site at the same position. The
    # previous ``range(0, len(canalblog)-1)`` skipped the last pair and did
    # nothing at all when a single pair was given.
    for canal, target in zip(canalblog, wordpress):
        try:
            url = _canalblog_https_url(canal)
        except Exception as err:
            logger.error("parsing error : {0}".format(err))
            exit(1)
        exportWp = WPExport(name=name, url=url, logger=logger, parser=parser)
        webpage = exportWp.getUrlPage(name_thread, max_thread)
        del exportWp
        importWp = WPimport(name=name, basic=basic, wordpress=target, logger=logger, parser=parser)
        importWp.fromUrl(webpage)
        del importWp
|
2023-04-27 00:00:53 +02:00
|
|
|
|
|
|
|
|
2023-04-28 23:37:13 +02:00
|
|
|
def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial):
    """Import this worker's share of exported HTML directories into WordPress.

    Args:
        name_thread: Zero-based index of this worker; used to build the
            ``Thread-<index + 1>`` importer name and forwarded to
            ``WPimport.fromDirectory``.
        max_thread: Total number of workers; forwarded to
            ``WPimport.fromDirectory`` (presumably so each worker handles a
            distinct slice of the files - confirm against WPimport).
        directory: Comma-separated directories containing exported HTML.
        logger: ``logging.Logger`` used for error reporting and handed to
            the importer.
        parser: Parser name handed to the importer.
        wordpress: Comma-separated WordPress URLs.
        basic: Authentication object handed to ``WPimport``.
        serial: When ``False`` every directory is imported into every
            WordPress site. Otherwise directories and sites are paired by
            position and both lists must have the same length.

    Raises:
        SystemExit: With code 1 when, in serial mode, the two lists differ
            in length.
    """
    name = "Thread-{0}".format(int(name_thread) + 1)
    directory = directory.split(",")
    wordpress = wordpress.split(",")

    if serial is False:
        for i in wordpress:
            importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser)
            for j in directory:
                importWp.fromDirectory(j, name_thread, max_thread)
            del importWp
        return

    if len(directory) != len(wordpress):
        logger.error("{0} : Error : Number directory is differant than wordpress".format(name))
        exit(1)

    # Pair each directory with the WordPress site at the same position. The
    # previous ``range(0, len(wordpress)-1)`` skipped the last pair and did
    # nothing at all when a single pair was given.
    for source, target in zip(directory, wordpress):
        importWp = WPimport(name=name, basic=basic, wordpress=target, logger=logger, parser=parser)
        # Pass the worker index and count exactly as the non-serial branch
        # does, so both modes call fromDirectory the same way.
        importWp.fromDirectory(source, name_thread, max_thread)
        del importWp
|
2023-04-22 00:07:54 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
2023-03-28 22:29:55 +02:00
|
|
|
if __name__ == '__main__':
    # ---- Command line -----------------------------------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", help="Verbosity", action="store_true")
    parser.add_argument("--logfile", help="Log file", default="")
    parser.add_argument("--quiet", help="No console output", action="store_true")
    parser.add_argument("--parser", help="Parser content", default="html.parser")
    # type=int makes argparse reject a non-numeric worker count up front.
    parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1, type=int)

    subparsers = parser.add_subparsers(dest="command")

    import_parser = subparsers.add_parser("import")
    import_parser.add_argument("--user", help="wordpress user", required=True)
    import_parser.add_argument("--file", help="HTML file", default="")
    import_parser.add_argument("--directory", help="HTML directory", default="")
    import_parser.add_argument("--canalblog", help="URL Canalblog", default="")
    import_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
    import_parser.add_argument("--serial", help="Serial execution", action="store_true")

    export_parser = subparsers.add_parser("export")
    export_parser.add_argument("--url", help="canblog URL to be scraping", required=True)
    export_parser.add_argument("--directory",
                               default="backup",
                               help="backup file path")
    # The --no-* flags are stored under positive names: True means "skip".
    export_parser.add_argument("--no-css", help="No CSS", dest="css", action="store_true")
    export_parser.add_argument("--no-js", help="No JS", dest="js", action="store_true")
    export_parser.add_argument("--no-img", help="No img", dest="img", action="store_true")
    export_parser.add_argument("--no-html", help="No HTML", dest="html", action="store_true")

    args = parser.parse_args()

    # ---- Logging ----------------------------------------------------------
    logger = logging.getLogger('import export canalblog')
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Set the logger level regardless of --quiet. It used to be set only
    # together with the console handler, so "--quiet --logfile" left the
    # logger at its default level and INFO/DEBUG records never reached the
    # log file.
    level = logging.DEBUG if args.debug is True else logging.INFO
    logger.setLevel(level)

    if args.quiet is False:
        ch = logging.StreamHandler()
        ch.setLevel(level)
        ch.setFormatter(formatter)
        logger.addHandler(ch)

    if len(args.logfile) > 0:
        fileHandler = logging.FileHandler(args.logfile)
        fileHandler.setLevel(level)
        fileHandler.setFormatter(formatter)
        logger.addHandler(fileHandler)

    def run_workers(target, *task_args):
        """Run *target* once per worker index and log every worker failure.

        Each call receives ``(index, worker_count, *task_args)``.
        """
        try:
            with futures.ThreadPoolExecutor(max_workers=args.parallel) as ex:
                wait_for = [
                    ex.submit(target, i, args.parallel, *task_args)
                    for i in range(args.parallel)
                ]
        except Exception as err:
            logger.error("Threading error : {0}".format(err))
            return
        # Leaving the "with" block waits for all tasks. An exception raised
        # inside a worker (including SystemExit from exit()) is stored on
        # its future, so report it here instead of losing it silently.
        for task in wait_for:
            failure = task.exception()
            if failure is not None:
                logger.error("Threading error : {0}".format(failure))

    # ---- import -----------------------------------------------------------
    if args.command == "import":
        password = getpass()
        if len(password) == 0:
            logger.error("No password error !!! ")
            exit(1)

        basic = HTTPBasicAuth(args.user, password)
        wordpress = args.wordpress.split(",")
        # Keyword arguments, like every other WPimport call in this file,
        # so the values cannot bind to the wrong parameters.
        importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser)

        if len(args.file) > 0:
            for i in wordpress:
                importWp.setUrl(i)
                importWp.fromFile(files=args.file.split(","))

        if len(args.directory) > 0:
            run_workers(importDirectory, args.directory, logger, args.parser,
                        args.wordpress, basic, args.serial)

        if len(args.canalblog) > 0:
            run_workers(importUrl, args.canalblog, logger, args.parser,
                        args.wordpress, basic, args.serial)

        exit(0)

    # ---- export -----------------------------------------------------------
    if args.command == "export":
        canalblog = args.url.split(",")
        for canal in canalblog:
            # One exporter per URL: it is deleted below before the threaded
            # download starts, so a shared instance would no longer exist
            # for the second URL of a comma-separated list.
            exportWp = WPExport(logger=logger, parser=args.parser, directory=args.directory)
            try:
                o = urlparse(canal)
                o = o._replace(scheme="https")
                # A scheme-less address can yield "https:///host"; collapse it.
                url = o.geturl().replace(":///", "://")
            except Exception as err:
                logger.error("parsing error : {0}".format(err))
                exit(1)

            exportWp.setUrl(url)
            if args.js is False:
                exportWp.downloadJs()
            if args.css is False:
                exportWp.downloadCss()
            del exportWp

            # HTML pages and images are fetched by the worker pool.
            if args.html is False or args.img is False:
                run_workers(download, url, logger, args.parser, args.directory,
                            args.html, args.img)

    exit(0)
|