add urlparse

2023-03-04 16:12:42 +01:00 · 2023-03-04 16:12:42 +01:00 · 3c76cab9a7
commit 3c76cab9a7
parent d21af4f60a
1 changed files with 6 additions and 3 deletions
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -3,6 +3,7 @@
 # Python 3
 # Extraction des liens d'une page web
 from bs4 import BeautifulSoup
+from urllib.parse import urlparse
 import requests

 URL = "www.clarissariviere.com"
@@ -21,6 +22,8 @@ if page.status_code == 200:
 webpage = []
 for i in page_url:
    page = requests.get(i)
+    o = urlparse(i)
+    print(o.path)
    if page.status_code == 200:
        print("page : {0}".format(i))
        soup = BeautifulSoup(page.text, 'html.parser')
@@ -34,10 +37,10 @@ for i in page_url:
                number_lastpage = int(number_page) / 10
                for j in range(1,int(number_lastpage)):
                    paging = j * 10
-                    categorie = i.split("/")
+                    categorie = urlparse(i).path.split("/")
                    url_paging = "https://{0}/archives/p{1}-10.html".format(URL, paging)
-                    if len(categorie) != 4:
-                        url_paging = "https://{0}/archives/{1}/p{2}-10.html".format(URL, categorie[4], paging)
+                    if len(categorie) > 2:
+                        url_paging = "https://{0}/archives/{1}/p{2}-10.html".format(URL, categorie[2], paging)
                    print(url_paging)
                    page = requests.get(url_paging)
                    if page.status_code == 200: