create dir for every path

parent 3c76cab9a7
commit 6794f77df2

web_scrap.py  31
@@ -1,10 +1,23 @@
 #!/usr/bin/python3

 # Python 3
 # Extraction des liens d'une page web
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
-import requests
+import requests, os
+
+def mkdir_path(path_dir):
+    if not os.path.exists(path_dir):
+        makedir = []
+        pathh = path_dir.split("/")
+        for i in pathh:
+            makedir.append(i)
+            repath = "/".join(makedir)
+            if not os.path.exists(repath):
+                os.mkdir(repath)
+
+BACKUP_DIR = "backup"
+
+mkdir_path(BACKUP_DIR)
+
 URL = "www.clarissariviere.com"
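
Note on the new mkdir_path() helper: it rebuilds the path one component at a time and calls os.mkdir() for each level that does not exist yet, so a nested path like backup/a/b can be created in one call. A minimal sketch of equivalent behaviour using only the standard library (the helper name and directory below are hypothetical examples, not part of the commit):

    import os

    def mkdir_path_sketch(path_dir):
        # os.makedirs with exist_ok=True creates every missing
        # intermediate directory in a single call.
        os.makedirs(path_dir, exist_ok=True)

    mkdir_path_sketch("backup/index/2015")
    # creates backup/, backup/index/ and backup/index/2015/ as needed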
@@ -22,8 +35,6 @@ if page.status_code == 200:
 webpage = []
 for i in page_url:
     page = requests.get(i)
-    o = urlparse(i)
-    print(o.path)
     if page.status_code == 200:
         print("page : {0}".format(i))
         soup = BeautifulSoup(page.text, 'html.parser')
@@ -51,9 +62,9 @@ for i in page_url:
             if href not in webpage:
                 webpage.append(href)

-print(webpage)
+for i in webpage:
+    o = urlparse(i)
+    path_web = o.path.split("/")
+    path_web.pop(len(path_web)-1)
+    dir_page_web = "/".join(path_web)
+    mkdir_path("{0}/{1}".format(BACKUP_DIR, dir_page_web))
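
For every collected link, the new loop mirrors the URL's directory layout under BACKUP_DIR: the path is split on "/", the last segment (the file name) is popped off, and the remaining segments are rejoined into the directory to create. A rough trace of one iteration, using a hypothetical URL from the crawled site:

    from urllib.parse import urlparse

    url = "http://www.clarissariviere.com/archive/2015/index.html"  # hypothetical example
    o = urlparse(url)
    path_web = o.path.split("/")        # ['', 'archive', '2015', 'index.html']
    path_web.pop(len(path_web) - 1)     # drop the file name -> ['', 'archive', '2015']
    dir_page_web = "/".join(path_web)   # '/archive/2015'
    print("{0}/{1}".format("backup", dir_page_web))
    # prints backup//archive/2015; the empty first segment of o.path yields a
    # double slash, which the OS collapses, so mkdir_path() creates backup/archive/2015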