From e42ffd98ae4bc5ca0e3360f944d024a133ed0438 Mon Sep 17 00:00:00 2001
From: Valentin CZERYBA
Date: Thu, 2 Mar 2023 23:28:04 +0100
Subject: [PATCH] scrap href all page from gouter

---
 web_scrap.py | 42 +++++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/web_scrap.py b/web_scrap.py
index 5571050..8ea6977 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -5,25 +5,45 @@ from bs4 import BeautifulSoup
 
 import requests
 
-page = requests.get("https://www.clarissariviere.com")
+URL = "www.clarissariviere.com"
+page = requests.get("https://{0}".format(URL))
+
+page_url = []
 
 if page.status_code == 200:
     soup = BeautifulSoup(page.text, 'html.parser')
     ul = soup.find_all("ul", id="listsmooth")
     for anchor in ul[0].find_all("a"):
         href = anchor.get('href', '/')
-        if href != "#" and href != "http://www.clarissariviere.com/":
-            print(href)
+        if href != "#":
+            page_url.append(href)
 
-
-for i in range(1,100):
-    paging = i * 10
-    page = requests.get("https://www.clarissariviere.com/archives/p{0}-10.html".format(i))
-    soup = BeautifulSoup(page.text, 'html.parser')
+for i in page_url:
+    page = requests.get(i)
     if page.status_code == 200:
-        h2 = soup.find_all("h2")
-        for title in h2:
-            print(title.find_all("a")[0].get("href", "/"))
+        print("page : {0}".format(i))
+        soup = BeautifulSoup(page.text, 'html.parser')
+        class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline")
+        if len(class_div) > 0:
+            pagingfirstline = class_div[0].find_all("a")
+            if len(pagingfirstline) > 1:
+                lastpage = pagingfirstline[len(pagingfirstline)-1].get("href", "/")
+                element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1]
+                number_page = element_lastpage.split("-")[0].split("p")[1]
+                number_lastpage = int(number_page) / 10
+                for j in range(1,int(number_lastpage)):
+                    paging = j * 10
+                    categorie = i.split("/")
+                    url_paging = "https://{0}/archives/p{1}-10.html".format(URL, paging)
+                    if len(categorie) != 4:
+                        url_paging = "https://{0}/archives/{1}/p{2}-10.html".format(URL, categorie[4], paging)
+                    print(url_paging)
+                    page = requests.get(url_paging)
+                    if page.status_code == 200:
+                        soup = BeautifulSoup(page.text, 'html.parser')
+                        h2 = soup.find_all("h2")
+                        for title in h2:
+                            print(title.find_all("a")[0].get("href", "/"))
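
Reviewer note: the heart of this change is deriving the number of archive pages from the last pagination link (a href ending in p250-10.html implies paging offsets 10, 20, ... 240) and rebuilding the paged URLs per menu entry. Below is a minimal standalone sketch of that same logic for reference; the helper names last_page_offset and archive_urls are hypothetical (not in the patch), and it assumes, as the patch does, that category pages carry a div.pagingfirstline whose last anchor points at the final page.

#!/usr/bin/python3
# Sketch only: mirrors the pagination logic added in this patch.
from bs4 import BeautifulSoup
import requests

URL = "www.clarissariviere.com"

def last_page_offset(href):
    # Hypothetical helper: '/archives/p250-10.html' -> 250.
    # Same string surgery as the patch: take the last path element,
    # keep the text before the '-', read the digits after the leading 'p'.
    element = href.split("/")[-1]
    return int(element.split("-")[0].split("p")[1])

def archive_urls(category_href):
    # Hypothetical helper: yield every paged archive URL for one menu
    # entry, built the same way as the patched loop.
    page = requests.get(category_href)
    if page.status_code != 200:
        return
    soup = BeautifulSoup(page.text, "html.parser")
    paging = soup.find_all("div", class_="pagingfirstline")
    if not paging:
        return
    links = paging[0].find_all("a")
    if len(links) < 2:
        return
    last = last_page_offset(links[-1].get("href", "/"))
    parts = category_href.split("/")
    # Offsets 10, 20, ... up to but excluding the last page, matching
    # the patch's range(1, number_lastpage) with paging = j * 10.
    for offset in range(10, last, 10):
        if len(parts) != 4:
            # Category link: reuse its fifth path segment, as the patch does.
            yield "https://{0}/archives/{1}/p{2}-10.html".format(URL, parts[4], offset)
        else:
            # Home page: flat archive path.
            yield "https://{0}/archives/p{1}-10.html".format(URL, offset)

for url in archive_urls("https://{0}/".format(URL)):
    print(url)

One caveat carried over from the patch itself: a menu href with fewer than five path segments (and not exactly four) would still raise IndexError on the parts[4] / categorie[4] lookup, so that branch is worth guarding before this lands.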