web_scrap/import_export_canalblog.py

98 lines
3.6 KiB
Python
Raw Normal View History

2023-03-23 23:28:57 +01:00
#!/usr/bin/python3
from requests.auth import HTTPBasicAuth
from getpass import getpass
2023-04-08 12:27:30 +02:00
import argparse, logging
import WPImport
2023-03-28 22:29:55 +02:00
def build_parser():
    """Build the command-line parser with its ``import`` and ``export`` sub-commands.

    Returns:
        argparse.ArgumentParser: parser whose namespace exposes the chosen
        sub-command as ``command`` (``None`` when no sub-command is given).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", help="Verbosity", action="store_true")
    parser.add_argument("--logfile", help="Log file", default="")
    parser.add_argument("--quiet", help="No console output", action="store_true")
    parser.add_argument("--parser", help="Parser content", default="html.parser")

    subparsers = parser.add_subparsers(dest="command")

    import_parser = subparsers.add_parser("import")
    import_parser.add_argument("--user", help="wordpress user", required=True)
    import_parser.add_argument("--file", help="HTML file", default="")
    import_parser.add_argument("--directory", help="HTML directory", default="")
    import_parser.add_argument("--canalblog", help="URL Canalblog", default="")
    import_parser.add_argument("--wordpress", help="URL Wordpress", required=True)

    export_parser = subparsers.add_parser("export")
    export_parser.add_argument("--url", help="canblog URL to be scraping", required=True)
    export_parser.add_argument("--directory",
                               default="backup",
                               help="backup file path")
    # The --no-* switches are store_true flags stored under the resource name,
    # so e.g. ``args.css is True`` means "skip the CSS download".
    export_parser.add_argument("--no-css", help="No CSS", dest="css", action="store_true")
    export_parser.add_argument("--no-js", help="No JS", dest="js", action="store_true")
    export_parser.add_argument("--no-img", help="No img", dest="img", action="store_true")
    export_parser.add_argument("--no-html", help="No HTML", dest="html", action="store_true")

    return parser


def configure_logger(args):
    """Create the ``'insert wordpress'`` logger from the parsed options.

    Args:
        args: namespace providing ``debug``, ``quiet`` and ``logfile``.

    Returns:
        logging.Logger: logger with a console handler unless ``--quiet`` was
        given, and a file handler when ``--logfile`` is non-empty.
    """
    logger = logging.getLogger('insert wordpress')
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    level = logging.DEBUG if args.debug else logging.INFO

    # Set the level on the logger itself whatever --quiet says: it used to be
    # set only together with the console handler, so "--quiet --logfile f"
    # left the logger at its inherited default level and the file handler
    # never received INFO/DEBUG records.
    logger.setLevel(level)

    if not args.quiet:
        console = logging.StreamHandler()
        console.setLevel(level)
        console.setFormatter(formatter)
        logger.addHandler(console)

    if args.logfile:
        file_handler = logging.FileHandler(args.logfile)
        file_handler.setLevel(level)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger


def normalize_url(url):
    """Return *url* forced to the ``https`` scheme.

    A value without ``scheme://`` (e.g. ``"blog.canalblog.com"``) is treated
    as a bare host. It is prefixed with ``//`` before parsing so the host
    lands in ``netloc``; the previous approach patched ``":///"`` in the
    output of ``geturl()``, which depends on how a given Python release
    serialises an empty network location.

    Raises:
        ValueError: if ``urlparse`` rejects the value.
    """
    from urllib.parse import urlparse

    if "://" not in url:
        url = "//" + url.lstrip("/")
    return urlparse(url)._replace(scheme="https").geturl()


def run_import(args, logger):
    """Prompt for the WordPress password and import HTML through ``WPImport``.

    ``--file`` (comma-separated list) takes precedence over ``--directory``.

    Returns:
        int: process exit status (1 when the password is empty, else 0).
    """
    password = getpass()
    if not password:
        logger.error("No password error !!! ")
        return 1

    basic = HTTPBasicAuth(args.user, password)
    importWp = WPImport.WPimport(basic, args.wordpress, logger, args.parser)

    if args.file:
        importWp.fromFile(args.file.split(","))
        return 0
    if args.directory:
        importWp.fromDirectory(args.directory)
        return 0

    # Previously this case ended silently after asking for a password.
    logger.warning("Nothing to import: neither --file nor --directory was given")
    return 0


def run_export(args, logger):
    """Download the selected resources of a blog through ``WPExport``.

    Returns:
        int: process exit status (1 when the URL cannot be parsed, else 0).
    """
    # Project module used below; it was referenced without ever being imported.
    import WPExport

    try:
        url = normalize_url(args.url)
    except ValueError as err:
        logger.error("parsing error : {0}".format(err))
        return 1

    # The export sub-parser defines --directory; ``args.dir`` does not exist.
    exportWp = WPExport.WPExport(url, logger, args.parser, args.directory)

    if not args.js:
        exportWp.downloadJs()
    if not args.css:
        exportWp.downloadCss()

    # The page list is fetched once and shared by the HTML and image steps.
    if not args.html or not args.img:
        webpage = exportWp.getUrlPage()
        if not args.html:
            exportWp.downloadHTML(webpage)
        if not args.img:
            exportWp.downloadImg(webpage)
    return 0


def main(argv=None):
    """Parse *argv* (default: ``sys.argv[1:]``) and run the chosen sub-command.

    Returns:
        int: process exit status; 0 when no sub-command is given.
    """
    args = build_parser().parse_args(argv)
    logger = configure_logger(args)

    if args.command == "import":
        return run_import(args, logger)
    if args.command == "export":
        return run_export(args, logger)
    return 0


if __name__ == '__main__':
    # SystemExit instead of exit(): the latter is an interactive helper
    # installed by the ``site`` module and is not guaranteed to exist.
    raise SystemExit(main())