web_scrap/web_scrap.py

#!/usr/bin/python3

# Python 3
# Extraction des liens d'une page web
from bs4 import BeautifulSoup
import requests

# Fetch the home page and print the href of every anchor found inside the
# <ul id="listsmooth"> list.
# The timeout keeps the script from hanging forever on an unresponsive
# server; requests raises requests.exceptions.Timeout when it expires.
page = requests.get("https://www.clarissariviere.com", timeout=30)

if page.status_code == 200:
    soup = BeautifulSoup(page.text, 'html.parser')
    # find() returns None when the list is absent, whereas indexing the
    # result of find_all() raised IndexError in that case.
    menu = soup.find("ul", id="listsmooth")
    if menu is not None:
        for anchor in menu.find_all("a"):
            # Anchors without an href attribute fall back to "/".
            href = anchor.get('href', '/')
            # Skip "#" placeholders and the (http) home-page URL.
            if href != "#" and href != "http://www.clarissariviere.com/":
                print(href)


# Walk the archive listing pages 1..99 and print the href of the first
# anchor found in every <h2> of each page that answers with HTTP 200.
# NOTE(review): the original computed `paging = i * 10` but never used it;
# the URL is built from `i` itself. Confirm which value the site expects.
for i in range(1, 100):
    # The timeout prevents one stalled request from blocking the whole run.
    page = requests.get(
        "https://www.clarissariviere.com/archives/p{0}-10.html".format(i),
        timeout=30,
    )
    # Check the status before parsing so error pages are not parsed at all.
    if page.status_code != 200:
        continue
    soup = BeautifulSoup(page.text, 'html.parser')
    for title in soup.find_all("h2"):
        # A heading without an anchor made `find_all("a")[0]` raise
        # IndexError; find() returns None instead, so it can be skipped.
        link = title.find("a")
        if link is not None:
            print(link.get("href", "/"))
first init 2023-02-28 21:42:21 +01:00			`#!/usr/bin/python3`

			`# Python 3`
			`# Extraction des liens d'une page web`
			`from bs4 import BeautifulSoup`
test webscrapping 2023-02-28 21:52:12 +01:00			`import requests`
first init 2023-02-28 21:42:21 +01:00
test webscrapping 2023-02-28 21:52:12 +01:00			`page = requests.get("https://www.clarissariviere.com")`

			`if page.status_code == 200:`
			`soup = BeautifulSoup(page.text, 'html.parser')`
menu nav list 2023-02-28 22:03:03 +01:00			`ul = soup.find_all("ul", id="listsmooth")`
			`for anchor in ul[0].find_all("a"):`
			`href = anchor.get('href', '/')`
			`if href != "#" and href != "http://www.clarissariviere.com/":`
			`print(href)`
get href article archive 2023-02-28 22:24:16 +01:00

			`for i in range(1,100):`
			`paging = i * 10`
			`page = requests.get("https://www.clarissariviere.com/archives/p{0}-10.html".format(i))`
			`soup = BeautifulSoup(page.text, 'html.parser')`
			`if page.status_code == 200:`
			`h2 = soup.find_all("h2")`
			`for title in h2:`
			`print(title.find_all("a")[0].get("href", "/"))`