From e42ffd98ae4bc5ca0e3360f944d024a133ed0438 Mon Sep 17 00:00:00 2001
From: Valentin CZERYBA
Date: Thu, 2 Mar 2023 23:28:04 +0100
Subject: [PATCH] scrap href all page from gouter

---
 web_scrap.py | 42 +++++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/web_scrap.py b/web_scrap.py
index 5571050..8ea6977 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -5,25 +5,45 @@ from bs4 import BeautifulSoup
 
 import requests
 
-page = requests.get("https://www.clarissariviere.com")
+URL = "www.clarissariviere.com"
+page = requests.get("https://{0}".format(URL))
+
+page_url = []
 
 if page.status_code == 200:
     soup = BeautifulSoup(page.text, 'html.parser')
     ul = soup.find_all("ul", id="listsmooth")
     for anchor in ul[0].find_all("a"):
         href = anchor.get('href', '/')
-        if href != "#" and href != "http://www.clarissariviere.com/":
-            print(href)
+        if href != "#":
+            page_url.append(href)
 
-
-for i in range(1,100):
-    paging = i * 10
-    page = requests.get("https://www.clarissariviere.com/archives/p{0}-10.html".format(i))
-    soup = BeautifulSoup(page.text, 'html.parser')
+for i in page_url:
+    page = requests.get(i)
     if page.status_code == 200:
-        h2 = soup.find_all("h2")
-        for title in h2:
-            print(title.find_all("a")[0].get("href", "/"))
+        print("page : {0}".format(i))
+        soup = BeautifulSoup(page.text, 'html.parser')
+        class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline")
+        if len(class_div) > 0:
+            pagingfirstline = class_div[0].find_all("a")
+            if len(pagingfirstline) > 1:
+                lastpage = pagingfirstline[len(pagingfirstline)-1].get("href", "/")
+                element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1]
+                number_page = element_lastpage.split("-")[0].split("p")[1]
+                number_lastpage = int(number_page) / 10
+                for j in range(1,int(number_lastpage)):
+                    paging = j * 10
+                    categorie = i.split("/")
+                    url_paging = "https://{0}/archives/p{1}-10.html".format(URL, paging)
+                    if len(categorie) != 4:
+                        url_paging = "https://{0}/archives/{1}/p{2}-10.html".format(URL, categorie[4], paging)
+                    print(url_paging)
+                    page = requests.get(url_paging)
+                    if page.status_code == 200:
+                        soup = BeautifulSoup(page.text, 'html.parser')
+                        h2 = soup.find_all("h2")
+                        for title in h2:
+                            print(title.find_all("a")[0].get("href", "/"))
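
Reviewer note: the heart of this change is deriving the number of archive pages from the last pagination link (a href ending in p250-10.html implies paging offsets 10, 20, ... 240) and rebuilding the paged URLs per menu entry. Below is a minimal standalone sketch of that same logic for reference; the helper names last_page_offset and archive_urls are hypothetical (not in the patch), and it assumes, as the patch does, that category pages carry a div.pagingfirstline whose last anchor points at the final page.

#!/usr/bin/python3
# Sketch only: mirrors the pagination logic added in this patch.
from bs4 import BeautifulSoup
import requests

URL = "www.clarissariviere.com"

def last_page_offset(href):
    # Hypothetical helper: '/archives/p250-10.html' -> 250.
    # Same string surgery as the patch: take the last path element,
    # keep the text before the '-', read the digits after the leading 'p'.
    element = href.split("/")[-1]
    return int(element.split("-")[0].split("p")[1])

def archive_urls(category_href):
    # Hypothetical helper: yield every paged archive URL for one menu
    # entry, built the same way as the patched loop.
    page = requests.get(category_href)
    if page.status_code != 200:
        return
    soup = BeautifulSoup(page.text, "html.parser")
    paging = soup.find_all("div", class_="pagingfirstline")
    if not paging:
        return
    links = paging[0].find_all("a")
    if len(links) < 2:
        return
    last = last_page_offset(links[-1].get("href", "/"))
    parts = category_href.split("/")
    # Offsets 10, 20, ... up to but excluding the last page, matching
    # the patch's range(1, number_lastpage) with paging = j * 10.
    for offset in range(10, last, 10):
        if len(parts) != 4:
            # Category link: reuse its fifth path segment, as the patch does.
            yield "https://{0}/archives/{1}/p{2}-10.html".format(URL, parts[4], offset)
        else:
            # Home page: flat archive path.
            yield "https://{0}/archives/p{1}-10.html".format(URL, offset)

for url in archive_urls("https://{0}/".format(URL)):
    print(url)

One caveat carried over from the patch itself: a menu href with fewer than five path segments (and not exactly four) would still raise IndexError on the parts[4] / categorie[4] lookup, so that branch is worth guarding before this lands.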