#!/usr/bin/python3
# web_scrap/import_export_canalblog.py
import argparse
import glob
import logging
import os
import threading
from concurrent import futures
from concurrent.futures import as_completed, wait, ALL_COMPLETED
from getpass import getpass
from urllib.parse import urlparse

from requests.auth import HTTPBasicAuth

from lib.WPChange import WPChange
from lib.WPExport import WPExport
from lib.WPImport import WPimport
from lib.WPRemove import WPRemove


def change(index, number, args, logger):
    """Run ``WPChange.fromDirectory`` on ``args.directory`` for one worker.

    index  -- passed to WPChange as ``index_name``
    number -- passed to WPChange as ``number_thread``
    args   -- parsed command line; only ``args.directory`` is read here
    logger -- logger handed to WPChange
    """
    worker = WPChange(logger=logger, index_name=index, number_thread=number)
    worker.fromDirectory(args.directory)
    # Drop the reference explicitly, as the rest of the script does for its workers.
    del worker


def remove(index, number, args, basic, logger, ssl_wordpress):
    """Clean content on every WordPress URL listed in ``args.wordpress``.

    When ``args.remove`` is set, posts, tags, categories and media are all
    cleaned (in that order). Otherwise only the kinds selected through
    ``args.posts``, ``args.categories``, ``args.tags`` and ``args.media``
    are cleaned.

    index, number -- passed to WPRemove as ``index_name`` / ``number_thread``
    basic         -- authentication object handed to WPRemove
    ssl_wordpress -- passed through to WPRemove unchanged
    """
    cleaner = WPRemove(
        basic=basic,
        wordpress="",
        logger=logger,
        ssl_wordpress=ssl_wordpress,
        index_name=index,
        number_thread=number,
    )
    for site in args.wordpress.split(","):
        cleaner.setUrl(site)
        if args.remove == True:
            # "Remove all": fixed order posts -> tags -> categories -> media.
            cleaner.cleanPosts()
            cleaner.cleanTags()
            cleaner.cleanCategories()
            cleaner.cleanMedia()
            continue
        # Selective removal: each flag is checked independently.
        if args.posts == True:
            cleaner.cleanPosts()
        if args.categories == True:
            cleaner.cleanCategories()
        if args.tags == True:
            cleaner.cleanTags()
        if args.media == True:
            cleaner.cleanMedia()
    del cleaner


def download(name_thread, max_thread, url, logger, parser, directory, html, img, ssl_canalblog, revert, tmp):
    """Export the HTML pages and/or images of one blog URL for one worker.

    name_thread, max_thread -- worker index and worker count, forwarded to
                               ``WPExport.getUrlPage``
    html, img               -- "skip" flags: a kind is downloaded only when
                               its flag is exactly ``False``
    revert                  -- when truthy, ``getUrlPage`` is not called
    tmp                     -- accepted but not used by this function
    """
    # NOTE(review): importUrl forwards tmp to WPExport, this function does
    # not -- confirm whether WPExport's default tmp is the intended one here.
    label = "Thread-{0}".format(int(name_thread) + 1)
    exporter = WPExport(
        name=label,
        url=url,
        logger=logger,
        parser=parser,
        directory=directory,
        ssl_canalblog=ssl_canalblog,
    )
    if not revert:
        exporter.getUrlPage(name_thread, max_thread)

    # Same visiting order as two nested loops: kind outermost, section innermost.
    combos = [
        (section, kind)
        for kind in ("article", "page")
        for section in ("publications", "principal")
    ]
    for section, kind in combos:
        if html is False:
            exporter.downloadHTML(section, kind)
        if img is False:
            exporter.downloadImg(section, kind)
    del exporter


def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress, ssl_canalblog, create, update, image, revert, tmp):
    """Scrape Canalblog URLs and import them into WordPress for one worker.

    canalblog, wordpress -- comma-separated URL lists
    name_thread, max_thread -- worker index and worker count, forwarded to
                               ``WPExport.getUrlPage``
    serial -- ``False``: every Canalblog URL is imported into every WordPress
              URL. Otherwise the two lists are paired one-to-one and must
              have the same length.
    ssl_canalblog -- ``False`` forces the ``http`` scheme on Canalblog URLs,
                     anything else forces ``https``
    create, update, image -- passed to WPimport as ``no_create``,
                             ``no_update`` and ``no_image``
    revert -- when truthy, ``getUrlPage`` is skipped
    tmp    -- passed to both WPExport and WPimport

    Raises SystemExit(1) when a URL cannot be parsed or, in paired mode,
    when the two lists differ in length.
    """
    canalblog = canalblog.split(",")
    wordpress = wordpress.split(",")
    name = "Thread-{0}".format(int(name_thread) + 1)
    protocol = "http" if ssl_canalblog is False else "https"

    if serial is False:
        # Every blog goes to every WordPress site.
        jobs = [(canal, wordpress) for canal in canalblog]
    else:
        if len(canalblog) != len(wordpress):
            logger.error("{0} : ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress".format(name))
            raise SystemExit(1)
        # zip() visits every pair; the previous range(0, len - 1) skipped the
        # last one and did nothing at all for a single blog.
        jobs = [(canal, [site]) for canal, site in zip(canalblog, wordpress)]

    for canal, sites in jobs:
        try:
            parsed = urlparse(canal)._replace(scheme=protocol)
            # A URL given without scheme yields "scheme:///host"; collapse it.
            url = parsed.geturl().replace(":///", "://")
        except Exception as err:
            logger.error("{0} : parsing error : {1}".format(name, err))
            raise SystemExit(1)

        # tmp is given to the exporter in both modes so that it matches the
        # tmp handed to WPimport below.
        exportWp = WPExport(name=name, url=url, logger=logger, parser=parser, ssl_canalblog=ssl_canalblog, tmp=tmp)
        if not revert:
            exportWp.getUrlPage(name_thread, max_thread)
        del exportWp

        for site in sites:
            importWp = WPimport(name=name, basic=basic, wordpress=site, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp)
            for kind in ["article", "page"]:
                for section in ["publications", "principal"]:
                    # Same call form in both modes (the paired mode used to
                    # index an undefined ``webpage`` name).
                    importWp.fromUrl(section, kind)
            del importWp


def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial, ssl_wordpress, create, update, image):
    """Import exported directories into WordPress for one worker.

    directory, wordpress -- comma-separated lists
    name_thread, max_thread -- worker index and worker count, forwarded to
                               ``WPimport.fromDirectory``
    serial -- ``False``: every directory is imported into every WordPress
              URL. Otherwise the two lists are paired one-to-one and must
              have the same length.
    create, update, image -- passed to WPimport as ``no_create``,
                             ``no_update`` and ``no_image``

    Raises SystemExit(1) in paired mode when the lists differ in length.
    """
    name = "Thread-{0}".format(int(name_thread) + 1)
    directory = directory.split(",")
    wordpress = wordpress.split(",")

    if serial is False:
        jobs = [(site, directory) for site in wordpress]
    else:
        if len(directory) != len(wordpress):
            logger.error("{0} : Error : Number directory is differant than wordpress".format(name))
            raise SystemExit(1)
        # zip() visits every pair; the previous range(0, len - 1) skipped the
        # last one and did nothing at all for a single site.
        jobs = [(site, [folder]) for site, folder in zip(wordpress, directory)]

    for site, folders in jobs:
        importWp = WPimport(name=name, basic=basic, wordpress=site, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image)
        for folder in folders:
            # The worker index/count is forwarded in both modes (the paired
            # mode used to omit it).
            importWp.fromDirectory(folder, name_thread, max_thread)
        del importWp


def run_workers(logger, workers, target, *args):
    """Run ``target(index, workers, *args)`` once per worker and wait for all.

    Exceptions raised inside a worker (including SystemExit) are stored on
    its future; they are logged here instead of being silently dropped.
    Errors raised while creating the pool propagate to the caller.

    Returns the number of workers that ended with an exception.
    """
    with futures.ThreadPoolExecutor(max_workers=workers) as executor:
        jobs = [executor.submit(target, index, workers, *args) for index in range(workers)]
        wait(jobs, return_when=ALL_COMPLETED)

    failed = 0
    label = getattr(target, "__name__", repr(target))
    for index, job in enumerate(jobs):
        error = job.exception()
        if error is not None:
            failed += 1
            logger.error("Thread-{0} : {1} failed : {2!r}".format(index + 1, label, error))
    return failed


if __name__ == '__main__':
    TMP = "/tmp/import_export_canablog"

    # ---- command line ------------------------------------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", help="Verbosity", action="store_true")
    parser.add_argument("--logfile", help="Log file", default="")
    parser.add_argument("--quiet", help="No console output", action="store_true")
    parser.add_argument("--parser", help="Parser content", default="html.parser")
    # type=int: the value is used as a thread count everywhere below.
    parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1, type=int)
    parser.add_argument("--no-ssl", help="No ssl for canalblog and/or wordpress (example wordpress,canalblog)", dest="ssl", default="")
    parser.add_argument("--revert", help="Restart a work from stopping work", action="store_true")

    subparsers = parser.add_subparsers(dest="command")

    import_parser = subparsers.add_parser("import")
    import_parser.add_argument("--user", help="wordpress user", required=True)
    import_parser.add_argument("--password", help="password wordpress's user", default="")
    import_parser.add_argument("--file", help="HTML file", default="")
    import_parser.add_argument("--directory", help="HTML directory", default="")
    import_parser.add_argument("--canalblog", help="URL Canalblog", default="")
    import_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
    import_parser.add_argument("--serial", help="Serial execution", action="store_true")
    import_parser.add_argument("--remove-all", dest="remove", help="Remove all", action="store_true")
    import_parser.add_argument("--remove-posts", help="Remove all posts", dest="posts", action="store_true")
    import_parser.add_argument("--remove-categories", help="Remove all categories", dest="categories", action="store_true")
    import_parser.add_argument("--remove-tags", help="Remove all tags", dest="tags", action="store_true")
    import_parser.add_argument("--remove-media", help="Remove all media", dest="media", action="store_true")
    # store_true already defaults to False. The former default="store_false"
    # was a non-empty string, so these options were truthy even when absent.
    import_parser.add_argument("--no-create", help="No create post", dest="create", action="store_true")
    import_parser.add_argument("--no-update", help="No update post", dest="update", action="store_true")
    import_parser.add_argument("--no-image", help="No image add or update", dest="image", action="store_true")

    remove_parser = subparsers.add_parser("remove")
    remove_parser.add_argument("--user", help="wordpress user", required=True)
    remove_parser.add_argument("--password", help="password wordpress's user", default="")
    remove_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
    remove_parser.add_argument("--all", dest="remove", help="Remove all (posts, media, tags, categories)", action="store_true")
    remove_parser.add_argument("--posts", help="Remove all posts", action="store_true")
    remove_parser.add_argument("--categories", help="Remove all categories", action="store_true")
    remove_parser.add_argument("--tags", help="Remove all tags", action="store_true")
    remove_parser.add_argument("--media", help="Remove all media", action="store_true")

    export_parser = subparsers.add_parser("export")
    export_parser.add_argument("--url", help="canblog URL to be scraping", required=True)
    export_parser.add_argument("--directory", default="backup", help="backup file path")
    export_parser.add_argument("--no-css", help="No CSS", dest="css", action="store_true")
    export_parser.add_argument("--no-js", help="No JS", dest="js", action="store_true")
    export_parser.add_argument("--no-img", help="No img", dest="img", action="store_true")
    export_parser.add_argument("--no-html", help="No HTML", dest="html", action="store_true")

    change_parser = subparsers.add_parser("change")
    change_parser.add_argument("--directory", default="", help="Directory")
    change_parser.add_argument("--file", default="", help="File")

    args = parser.parse_args()

    # Created only after the arguments are valid; exist_ok avoids failing
    # when another run creates the directory at the same time.
    os.makedirs(TMP, exist_ok=True)

    # ---- logging -----------------------------------------------------------
    logger = logging.getLogger('import export canalblog')
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    level = logging.DEBUG if args.debug is True else logging.INFO
    # Set on the logger itself in every case: otherwise --quiet with
    # --logfile leaves the logger at WARNING and the file misses INFO/DEBUG.
    logger.setLevel(level)
    if args.quiet is False:
        ch = logging.StreamHandler()
        ch.setLevel(level)
        ch.setFormatter(formatter)
        logger.addHandler(ch)
    if len(args.logfile) > 0:
        fileHandler = logging.FileHandler(args.logfile)
        fileHandler.setLevel(level)
        fileHandler.setFormatter(formatter)
        logger.addHandler(fileHandler)

    # ---- --no-ssl handling -------------------------------------------------
    ssl_canalblog = True
    ssl_wordpress = True
    for i in args.ssl.split(","):
        if i == "canalblog":
            ssl_canalblog = False
        if i == "wordpress":
            ssl_wordpress = False

    # ---- credentials (import / remove only) --------------------------------
    if args.command == "import" or args.command == "remove":
        password = args.password
        if len(args.password) == 0:
            password = getpass()
        if len(password) == 0:
            logger.error("No password error !!! ")
            raise SystemExit(1)
        basic = HTTPBasicAuth(args.user, password)

    # ---- import ------------------------------------------------------------
    if args.command == "import":
        wordpress = args.wordpress.split(",")
        importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser, ssl_wordpress=ssl_wordpress)
        if len(args.file) > 0:
            for i in wordpress:
                importWp.setUrl(i)
                importWp.fromFile(files=args.file.split(","))
        if len(args.directory) > 0:
            try:
                # The removal pass finishes before the import pass starts.
                run_workers(logger, args.parallel, remove, args, basic, logger, ssl_wordpress)
                # importDirectory takes neither a tmp nor a revert argument;
                # the two extra values passed before made every call fail.
                run_workers(logger, args.parallel, importDirectory, args.directory, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, args.create, args.update, args.image)
            except Exception as err:
                logger.error("Threading error : {0}".format(err))
        if len(args.canalblog) > 0:
            try:
                run_workers(logger, args.parallel, remove, args, basic, logger, ssl_wordpress)
                run_workers(logger, args.parallel, importUrl, args.canalblog, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, ssl_canalblog, args.create, args.update, args.image, args.revert, TMP)
            except Exception as err:
                logger.error("Threading error : {0}".format(err))
        raise SystemExit(0)

    # ---- export ------------------------------------------------------------
    if args.command == "export":
        canalblog = args.url.split(",")
        protocol = "https"
        if ssl_canalblog is False:
            protocol = "http"
        for canal in canalblog:
            try:
                o = urlparse(canal)
                o = o._replace(scheme=protocol)
                url = o.geturl().replace(":///", "://")
            except Exception as err:
                logger.error("parsing error : {0}".format(err))
                raise SystemExit(1)
            # One exporter per URL: it is deleted at the end of each pass, so
            # a single shared instance would be gone for the second URL.
            exportWp = WPExport(logger=logger, parser=args.parser, directory=args.directory, ssl_canalblog=ssl_canalblog)
            exportWp.setUrl(url)
            if args.js is False:
                exportWp.downloadJs()
            if args.css is False:
                exportWp.downloadCss()
            del exportWp

            if args.html is False or args.img is False:
                try:
                    run_workers(logger, args.parallel, download, url, logger, args.parser, args.directory, args.html, args.img, ssl_canalblog, args.revert, TMP)
                except Exception as err:
                    logger.error("Threading error : {0}".format(err))
        raise SystemExit(0)

    # ---- remove ------------------------------------------------------------
    if args.command == "remove":
        try:
            run_workers(logger, args.parallel, remove, args, basic, logger, ssl_wordpress)
        except Exception as err:
            logger.error("Thread error for remove : {0}".format(err))
        raise SystemExit(0)

    # ---- change ------------------------------------------------------------
    if args.command == "change":
        if len(args.directory) > 0:
            try:
                run_workers(logger, args.parallel, change, args, logger)
            except Exception as err:
                logger.error("Thread error for change : {0}".format(err))
        if len(args.file) > 0:
            changeWp = WPChange(logger=logger)
            for filei in args.file.split(","):
                changeWp.fromFile(filei)
    raise SystemExit(0)