From 6794f77df2da4e63dcabc8b75071c151286c38ff Mon Sep 17 00:00:00 2001
From: Valentin CZERYBA
Date: Sat, 4 Mar 2023 18:35:06 +0100
Subject: [PATCH] create dir for every path

---
 web_scrap.py | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/web_scrap.py b/web_scrap.py
index caf2537..c381606 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -1,10 +1,23 @@
 #!/usr/bin/python3
-
 # Python 3
 # Extraction des liens d'une page web
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
-import requests
+import requests, os
+
+def mkdir_path(path_dir):
+    if not os.path.exists(path_dir):
+        makedir = []
+        pathh = path_dir.split("/")
+        for i in pathh:
+            makedir.append(i)
+            repath = "/".join(makedir)
+            if not os.path.exists(repath):
+                os.mkdir(repath)
+
+BACKUP_DIR = "backup"
+
+mkdir_path(BACKUP_DIR)
 
 URL = "www.clarissariviere.com"
 
@@ -22,8 +35,6 @@ if page.status_code == 200:
 webpage = []
 for i in page_url:
     page = requests.get(i)
-    o = urlparse(i)
-    print(o.path)
     if page.status_code == 200:
         print("page : {0}".format(i))
         soup = BeautifulSoup(page.text, 'html.parser')
@@ -51,9 +62,9 @@ for i in page_url:
             if href not in webpage:
                 webpage.append(href)
 
-print(webpage)
-
-
-
-
-
+for i in webpage:
+    o = urlparse(i)
+    path_web = o.path.split("/")
+    path_web.pop(len(path_web)-1)
+    dir_page_web = "/".join(path_web)
+    mkdir_path("{0}/{1}".format(BACKUP_DIR, dir_page_web))
\ No newline at end of file
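
Review note: the added mkdir_path builds each intermediate directory by hand, splitting the path on "/" and calling os.mkdir on every prefix. The standard library already covers this; below is a minimal sketch of an equivalent helper (an alternative suggestion, not what the patch does), using os.makedirs, whose exist_ok=True flag plays the same role as the patch's os.path.exists guards:

    import os

    def mkdir_path(path_dir):
        # Create path_dir and any missing parent directories in one call.
        # exist_ok=True makes the call idempotent, mirroring the
        # os.path.exists() checks in the patch.
        os.makedirs(path_dir, exist_ok=True)

    mkdir_path("backup")          # top-level backup directory
    mkdir_path("backup/2023/03")  # nested path created in one call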
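
Review note: the final loop maps each collected URL to a directory under BACKUP_DIR by dropping the last path segment (the file name) from urlparse(i).path. A small illustration of that transformation, using a hypothetical URL chosen only for the example:

    from urllib.parse import urlparse

    # Hypothetical page URL, used only to show the intermediate values.
    url = "http://www.clarissariviere.com/2023/03/some-article.html"

    o = urlparse(url)
    path_web = o.path.split("/")       # ['', '2023', '03', 'some-article.html']
    path_web.pop(len(path_web) - 1)    # drop the file name: ['', '2023', '03']
    dir_page_web = "/".join(path_web)  # '/2023/03'

    # The leading '/' in o.path yields a double slash in the final path
    # ('backup//2023/03'); os.mkdir tolerates this on POSIX systems.
    print("{0}/{1}".format("backup", dir_page_web))  # backup//2023/03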