#!/usr/bin/python3
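"""Collect links from www.clarissariviere.com: first the navigation-menu links
on the home page, then the article links listed in the <h2> titles of the
paginated archive pages (/archives/p1-10.html through p99-10.html)."""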
# Python 3
# Extract the links from a web page

from bs4 import BeautifulSoup
import requests

# Fetch the home page
page = requests.get("https://www.clarissariviere.com")
if page.status_code == 200:
    soup = BeautifulSoup(page.text, 'html.parser')
    # The navigation menu is the <ul id="listsmooth"> element
    ul = soup.find_all("ul", id="listsmooth")
    for anchor in ul[0].find_all("a"):
        href = anchor.get('href', '/')
        # Skip placeholder anchors and the link back to the home page
        if href != "#" and href != "http://www.clarissariviere.com/":
            print(href)

# Walk the paginated archive pages (/archives/pN-10.html) and print the
# article links found in the <h2> titles of each page
for i in range(1, 100):
    page = requests.get("https://www.clarissariviere.com/archives/p{0}-10.html".format(i))
    if page.status_code == 200:
        soup = BeautifulSoup(page.text, 'html.parser')
        h2 = soup.find_all("h2")
        for title in h2:
            links = title.find_all("a")
            if links:
                print(links[0].get("href", "/"))