From 3c76cab9a7566c46021846b7351bed829f1f5915 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Sat, 4 Mar 2023 16:12:42 +0100 Subject: [PATCH] add urlparse --- web_scrap.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/web_scrap.py b/web_scrap.py index a96d90f..caf2537 100644 --- a/web_scrap.py +++ b/web_scrap.py @@ -3,6 +3,7 @@ # Python 3 # Extraction des liens d'une page web from bs4 import BeautifulSoup +from urllib.parse import urlparse import requests URL = "www.clarissariviere.com" @@ -21,6 +22,8 @@ if page.status_code == 200: webpage = [] for i in page_url: page = requests.get(i) + o = urlparse(i) + print(o.path) if page.status_code == 200: print("page : {0}".format(i)) soup = BeautifulSoup(page.text, 'html.parser') @@ -34,10 +37,10 @@ for i in page_url: number_lastpage = int(number_page) / 10 for j in range(1,int(number_lastpage)): paging = j * 10 - categorie = i.split("/") + categorie = urlparse(i).path.split("/") url_paging = "https://{0}/archives/p{1}-10.html".format(URL, paging) - if len(categorie) != 4: - url_paging = "https://{0}/archives/{1}/p{2}-10.html".format(URL, categorie[4], paging) + if len(categorie) > 2: + url_paging = "https://{0}/archives/{1}/p{2}-10.html".format(URL, categorie[2], paging) print(url_paging) page = requests.get(url_paging) if page.status_code == 200: