diff --git a/web_scrap.py b/web_scrap.py
index fb8fc32..d08dfbd 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -1,9 +1,9 @@
 #!/usr/bin/python3
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
-import requests, os, argparse
+import requests, os, argparse, logging
 
-def mkdir_path(path_dir):
+def mkdir_path(path_dir, logger):
     if not os.path.exists(path_dir):
         makedir = []
         pathh = path_dir.split("/")
@@ -11,10 +11,11 @@ def mkdir_path(path_dir):
             makedir.append(i)
             repath = "/".join(makedir)
             if not os.path.exists(repath):
+                logger.debug("Directory created: {0}".format(repath))
                 os.mkdir(repath)
 
 
-def getUrlPage(url):
+def getUrlPage(url, logger):
     page = requests.get(url)
     page_url = []
     if page.status_code == 200:
@@ -29,7 +30,7 @@ def getUrlPage(url):
     for i in page_url:
         page = requests.get(i)
         if page.status_code == 200:
-            print("page : {0}".format(i))
+            logger.info("page : {0}".format(i))
             if i not in webpage:
                 webpage.append(i)
             soup = BeautifulSoup(page.text, 'html.parser')
@@ -47,7 +48,7 @@ def getUrlPage(url):
                     url_paging = "{0}/archives/p{1}-10.html".format(url, paging)
                     if len(categorie) > 2:
                         url_paging = "{0}/archives/{1}/p{2}-10.html".format(url, categorie[2], paging)
-                    print(url_paging)
+                    logger.info(url_paging)
                     if url_paging not in webpage:
                         webpage.append(url_paging)
                         page = requests.get(url_paging)
@@ -63,23 +64,23 @@ def getUrlPage(url):
     return webpage
 
 
-def downloadPage(url, backup_dir):
+def downloadPage(url, backup_dir, logger):
     o = urlparse(url)
     o = o._replace(scheme="https")
-    webpage = getUrlPage(o.geturl().replace(":///", "://"))
+    webpage = getUrlPage(o.geturl().replace(":///", "://"), logger)
     for i in range(0, len(webpage)):
         o = urlparse(webpage[i])
         path_web = o.path.split("/")
         filePageWeb = path_web[len(path_web)-1]
         path_web.pop(len(path_web)-1)
         dir_page_web = "/".join(path_web)
-        mkdir_path("{0}/{1}".format(backup_dir, dir_page_web))
+        mkdir_path("{0}/{1}".format(backup_dir, dir_page_web), logger)
         r = requests.get(webpage[i])
         if r.status_code == 200:
             fileDownload = "{0}/index.html".format(backup_dir)
             if len(dir_page_web) > 0 and len(filePageWeb) > 0:
                 fileDownload = "{0}/{1}/{2}".format(backup_dir, dir_page_web, filePageWeb)
-            print("{0}/{1} : {2}".format(i, len(webpage), fileDownload))
+            logger.info("{0}/{1} : {2}".format(i, len(webpage), fileDownload))
             open(fileDownload, "wb").write(r.content)
 
 
@@ -89,6 +90,20 @@ if __name__ == '__main__':
     parser.add_argument("--dir", default="backup", help="backup file path")
-    parser.add_argument("--verbosity", help="Verbosity", action="store_false")
+    parser.add_argument("--debug", help="Enable debug logging", action="store_true")
     args = parser.parse_args()
 
 
-    downloadPage(args.url, args.dir)
\ No newline at end of file
+    logger = logging.getLogger('web_scrap')
+    ch = logging.StreamHandler()
+
+    if args.debug:
+        logger.setLevel(logging.DEBUG)
+        ch.setLevel(logging.DEBUG)
+    else:
+        logger.setLevel(logging.INFO)
+        ch.setLevel(logging.INFO)
+
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    ch.setFormatter(formatter)
+    logger.addHandler(ch)
+
+    downloadPage(args.url, args.dir, logger)
\ No newline at end of file
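
A side note on the logger wiring above, not part of the patch itself: the stdlib convention is to create one module-level logger per file rather than threading a logger argument through every function signature. Below is a minimal sketch of that alternative, reusing the patch's --debug flag and log format; the mkdir_path body is stubbed purely for illustration.

#!/usr/bin/python3
# Sketch: module-level logger instead of a `logger` parameter on every function.
import argparse
import logging

# Created once at import time; every function in this module can use it directly.
logger = logging.getLogger(__name__)


def mkdir_path(path_dir):
    # ... directory-creation logic elided; only the logging call is shown ...
    # %-style lazy formatting: the message string is only built if DEBUG is enabled.
    logger.debug("Directory created: %s", path_dir)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", help="Enable debug logging", action="store_true")
    args = parser.parse_args()

    # basicConfig replaces the manual StreamHandler/Formatter/addHandler setup.
    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    )

    mkdir_path("backup/example")

This also sidesteps a pitfall of the manual approach: running the setup code twice would attach two handlers to the same named logger and duplicate every output line, whereas logging.basicConfig is a no-op once the root logger already has handlers.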