web_scrap/import_export_canalblog.py

202 lines
8.3 KiB
Python

#!/usr/bin/python3
from requests.auth import HTTPBasicAuth
from getpass import getpass
from urllib.parse import urlparse
from concurrent import futures
import argparse, logging, threading
from lib.WPImport import WPimport
from lib.WPExport import WPExport
def download(name_thread, max_thread, url, logger, parser, directory, html, img):
exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory)
webpage = exportWp.getUrlPage(name_thread, max_thread)
if html is False:
exportWp.downloadHTML(webpage)
if args.img is False:
exportWp.downloadImg(webpage)
def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial):
canalblog = canalblog.split(",")
wordpress = wordpress.split(",")
name = "Thread-{0}".format(int(name_thread) + 1)
if serial is False:
for canal in canalblog:
try:
o = urlparse(canal)
o = o._replace(scheme="https")
url = o.geturl().replace(":///", "://")
except Exception as err:
logger.error("{0} : parsing error : {1}".format(name, err))
exit(1)
exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser)
webpage = exportWp.getUrlPage(name_thread, max_thread)
for j in wordpress:
importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser)
importWp.fromUrl(webpage)
else:
if len(canalblog) != len(wordpress):
logger.error("{0} : ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress".format(name))
exit(1)
for i in range(0, len(canalblog)-1):
try:
o = urlparse(canalblog[i])
o = o._replace(scheme="https")
url = o.geturl().replace(":///", "://")
except Exception as err:
logger.error("parsing error : {0}".format(err))
exit(1)
exportWp = WPExport(name=name, url=url, logger=logger, parser=parser)
webpage = exportWp.getUrlPage(name_thread, max_thread)
importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser)
importWp.fromUrl(webpage)
def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial):
name = "Thread-{0}".format(int(name_thread) + 1)
directory = args.directory.split(",")
if serial is False:
for i in wordpress:
importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser)
for j in directory:
importWp.fromDirectory(j)
else:
if len(directory) != len(wordpress):
logger.error("{0} : Error : Number directory is differant than wordpress".format(name))
exit(1)
for i in range(0, len(wordpress)-1):
importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser)
importWp.fromDirectory(directory[i])
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--debug", help="Verbosity", action="store_true")
parser.add_argument("--logfile", help="Log file", default="")
parser.add_argument("--quiet", help="No console output", action="store_true")
parser.add_argument("--parser", help="Parser content", default="html.parser")
parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1)
subparsers = parser.add_subparsers(dest="command")
import_parser = subparsers.add_parser("import")
import_parser.add_argument("--user", help="wordpress user", required=True)
import_parser.add_argument("--file", help="HTML file", default="")
import_parser.add_argument("--directory", help="HTML directory", default="")
import_parser.add_argument("--canalblog", help="URL Canalblog", default="")
import_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
import_parser.add_argument("--serial", help="Serial execution", action="store_true")
export_parser = subparsers.add_parser("export")
export_parser.add_argument("--url", help="canblog URL to be scraping", required=True)
export_parser.add_argument("--directory",
default="backup",
help="backup file path")
export_parser.add_argument("--no-css", help="No CSS", dest="css", action="store_true")
export_parser.add_argument("--no-js", help="No JS", dest="js", action="store_true")
export_parser.add_argument("--no-img", help="No img", dest="img", action="store_true")
export_parser.add_argument("--no-html", help="No HTML", dest="html", action="store_true")
args = parser.parse_args()
logger = logging.getLogger('import export canalblog')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
if args.quiet is False:
ch = logging.StreamHandler()
if args.debug is True:
logger.setLevel(logging.DEBUG)
ch.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.INFO)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)
if len(args.logfile) > 0:
fileHandler = logging.FileHandler(args.logfile)
if args.debug is True:
fileHandler.setLevel(logging.DEBUG)
else:
fileHandler.setLevel(logging.INFO)
fileHandler.setFormatter(formatter)
logger.addHandler(fileHandler)
if args.command == "import":
password = getpass()
if len(password) == 0:
logger.error("No password error !!! ")
exit(1)
basic = HTTPBasicAuth(args.user, password)
wordpress = args.wordpress.split(",")
importWp = WPimport(basic, "", logger, args.parser)
if len(args.file) > 0:
for i in wordpress:
importWp.setUrl(i)
importWp.fromFile(files=args.file.split(","))
exit(0)
if len(args.directory) > 0:
try:
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [
ex.submit(importDi, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial)
for i in range(0, int(args.parallel))
]
except Exception as err:
logger.error("Threading error : {0}".format(err))
exit(0)
if len(args.canalblog) > 0:
try:
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [
ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial)
for i in range(0, int(args.parallel))
]
except Exception as err:
logger.error("Threading error : {0}".format(err))
if args.command == "export":
canalblog = args.url.split(",")
exportWp = WPExport(logger=logger, parser=args.parser, directory=args.directory)
for canal in canalblog:
try:
o = urlparse(canal)
o = o._replace(scheme="https")
url = o.geturl().replace(":///", "://")
except Exception as err:
logger.error("parsing error : {0}".format(err))
exit(1)
exportWp.setUrl(url)
if args.js is False:
exportWp.downloadJs()
if args.css is False:
exportWp.downloadCss()
if args.html is False or args.img is False:
try:
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [
ex.submit(download, i, int(args.parallel), url, logger, args.parser, args.directory, args.html, args.img)
for i in range(0, int(args.parallel))
]
except Exception as err:
logger.error("Threading error : {0}".format(err))
exit(0)