From 6794f77df2da4e63dcabc8b75071c151286c38ff Mon Sep 17 00:00:00 2001
From: Valentin CZERYBA
Date: Sat, 4 Mar 2023 18:35:06 +0100
Subject: [PATCH] create dir for every path

---
 web_scrap.py | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/web_scrap.py b/web_scrap.py
index caf2537..c381606 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -1,10 +1,23 @@
 #!/usr/bin/python3
-
 # Python 3
 # Extraction des liens d'une page web
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
-import requests
+import requests, os
+
+def mkdir_path(path_dir):
+    if not os.path.exists(path_dir):
+        makedir = []
+        pathh = path_dir.split("/")
+        for i in pathh:
+            makedir.append(i)
+            repath = "/".join(makedir)
+            if not os.path.exists(repath):
+                os.mkdir(repath)
+
+BACKUP_DIR = "backup"
+
+mkdir_path(BACKUP_DIR)
 
 URL = "www.clarissariviere.com"
 
@@ -22,8 +35,6 @@ if page.status_code == 200:
 webpage = []
 for i in page_url:
     page = requests.get(i)
-    o = urlparse(i)
-    print(o.path)
     if page.status_code == 200:
         print("page : {0}".format(i))
         soup = BeautifulSoup(page.text, 'html.parser')
@@ -51,9 +62,9 @@ for i in page_url:
             if href not in webpage:
                 webpage.append(href)
 
-print(webpage)
-
-
-
-
-
+for i in webpage:
+    o = urlparse(i)
+    path_web = o.path.split("/")
+    path_web.pop(len(path_web)-1)
+    dir_page_web = "/".join(path_web)
+    mkdir_path("{0}/{1}".format(BACKUP_DIR, dir_page_web))
\ No newline at end of file
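
Review note: the added mkdir_path builds each intermediate directory by hand, splitting the path on "/" and calling os.mkdir on every prefix. The standard library already covers this; below is a minimal sketch of an equivalent helper (an alternative suggestion, not what the patch does), using os.makedirs, whose exist_ok=True flag plays the same role as the patch's os.path.exists guards:

    import os

    def mkdir_path(path_dir):
        # Create path_dir and any missing parent directories in one call.
        # exist_ok=True makes the call idempotent, mirroring the
        # os.path.exists() checks in the patch.
        os.makedirs(path_dir, exist_ok=True)

    mkdir_path("backup")          # top-level backup directory
    mkdir_path("backup/2023/03")  # nested path created in one call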
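
Review note: the final loop maps each collected URL to a directory under BACKUP_DIR by dropping the last path segment (the file name) from urlparse(i).path. A small illustration of that transformation, using a hypothetical URL chosen only for the example:

    from urllib.parse import urlparse

    # Hypothetical page URL, used only to show the intermediate values.
    url = "http://www.clarissariviere.com/2023/03/some-article.html"

    o = urlparse(url)
    path_web = o.path.split("/")       # ['', '2023', '03', 'some-article.html']
    path_web.pop(len(path_web) - 1)    # drop the file name: ['', '2023', '03']
    dir_page_web = "/".join(path_web)  # '/2023/03'

    # The leading '/' in o.path yields a double slash in the final path
    # ('backup//2023/03'); os.mkdir tolerates this on POSIX systems.
    print("{0}/{1}".format("backup", dir_page_web))  # backup//2023/03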