#!/usr/bin/python3
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests, os, argparse

# NOTE(review): this text was reconstructed from the post-patch side of a
# whitespace-collapsed git diff of web_scrap.py.  mkdir_path() and
# getUrlPage() are only partially visible in that diff, so they are not
# reproduced here and must come from the rest of web_scrap.py.


def _backup_target(page_url, backup_dir):
    """Map a page URL to its location inside the backup tree.

    Args:
        page_url: absolute URL of the page to store.
        backup_dir: root directory of the backup.

    Returns:
        A ``(directory, file_path)`` tuple: the directory that has to exist
        before writing, and the full path of the file to write.
    """
    segments = urlparse(page_url).path.split("/")
    # A path that is empty or ends with "/" carries no file name; store such
    # pages as index.html inside their own directory instead of letting them
    # all overwrite <backup_dir>/index.html.
    file_name = segments[-1] or "index.html"
    dir_page_web = "/".join(segments[:-1])
    target_dir = "{0}/{1}".format(backup_dir, dir_page_web)
    if dir_page_web:
        file_download = "{0}/{1}".format(target_dir, file_name)
    else:
        # Page sits at the site root: keep its own name directly under
        # backup_dir rather than collapsing every root page to index.html.
        file_download = "{0}/{1}".format(backup_dir, file_name)
    return target_dir, file_download


def downloadPage(url, backup_dir):
    """Download every page listed by getUrlPage() for *url* into *backup_dir*.

    The URL is forced to the https scheme, the page URLs returned by
    getUrlPage() are fetched one by one, and each successful response body is
    written under *backup_dir*, mirroring the URL path.

    Args:
        url: address of the site to back up, with or without a scheme.
        backup_dir: directory under which the pages are stored.
    """
    o = urlparse(url)._replace(scheme="https")
    # A scheme-less address (e.g. "www.example.com") is parsed as a bare path,
    # so geturl() can yield "https:///..."; collapse that back to "https://".
    webpage = getUrlPage(o.geturl().replace(":///", "://"))
    total = len(webpage)
    # 1-based counter so the progress output runs from 1/N to N/N.
    for index, page_url in enumerate(webpage, start=1):
        target_dir, file_download = _backup_target(page_url, backup_dir)
        mkdir_path(target_dir)
        r = requests.get(page_url)
        if r.status_code != 200:
            # Report the failure instead of silently dropping the page.
            print("{0}/{1} : skipped {2} (HTTP {3})".format(
                index, total, page_url, r.status_code))
            continue
        print("{0}/{1} : {2}".format(index, total, file_download))
        # Context manager guarantees the handle is flushed and closed.
        with open(file_download, "wb") as backup_file:
            backup_file.write(r.content)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", help="canblog URL to be scraping", required=True)
    parser.add_argument("--dir",
                        default="backup",
                        help="backup file path")
    # store_true: the flag is off by default and passing it switches it on
    # (store_false made passing --verbosity turn the value off).
    parser.add_argument("--verbosity", help="Verbosity", action="store_true")
    args = parser.parse_args()
    downloadPage(args.url, args.dir)