From 64118a3c2074f7e05a114887d2990df73a216e3c Mon Sep 17 00:00:00 2001
From: Valentin CZERYBA
Date: Tue, 28 Feb 2023 21:52:12 +0100
Subject: [PATCH 01/11] test webscrapping

---
 web_scrap.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/web_scrap.py b/web_scrap.py
index 06ed902..f5a7ccf 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -3,10 +3,17 @@
 # Python 3
 # Extraction des liens d'une page web
 from bs4 import BeautifulSoup
-import urllib.request
+import requests
 
-with urllib.request.urlopen('https://www.clarissariviere.com/') as response:
-    webpage = response.read()
-    soup = BeautifulSoup(webpage, 'html.parser')
+page = requests.get("https://www.clarissariviere.com")
+
+if page.status_code == 200:
+    soup = BeautifulSoup(page.text, 'html.parser')
     for anchor in soup.find_all('a'):
-        print(anchor.get('href', '/'))
\ No newline at end of file
+        print(anchor.get('href', '/'))
+#with urllib.request.urlopen('https://www.clarissariviere.com/index.html') as response:
+#    print(response)
+    #webpage = response.read()
+    #soup = BeautifulSoup(webpage, 'html.parser')
+    #for anchor in soup.find_all('a'):
+    #    print(anchor.get('href', '/'))
\ No newline at end of file

From a03489ee2141b2584c07d0505254791c5e962ce7 Mon Sep 17 00:00:00 2001
From: Valentin CZERYBA
Date: Tue, 28 Feb 2023 22:03:03 +0100
Subject: [PATCH 02/11] menu nav list

---
 web_scrap.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/web_scrap.py b/web_scrap.py
index f5a7ccf..dff226d 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -9,11 +9,8 @@ page = requests.get("https://www.clarissariviere.com")
 
 if page.status_code == 200:
     soup = BeautifulSoup(page.text, 'html.parser')
-    for anchor in soup.find_all('a'):
-        print(anchor.get('href', '/'))
-#with urllib.request.urlopen('https://www.clarissariviere.com/index.html') as response:
-#    print(response)
-    #webpage = response.read()
-    #soup = BeautifulSoup(webpage, 'html.parser')
-    #for anchor in soup.find_all('a'):
-    #    print(anchor.get('href', '/'))
\ No newline at end of file
+    ul = soup.find_all("ul", id="listsmooth")
+    for anchor in ul[0].find_all("a"):
+        href = anchor.get('href', '/')
+        if href != "#" and href != "http://www.clarissariviere.com/":
+            print(href)

From 991590f8083353fba9d098392d5c65a1c3f19670 Mon Sep 17 00:00:00 2001
From: Valentin CZERYBA
Date: Tue, 28 Feb 2023 22:24:16 +0100
Subject: [PATCH 03/11] get href article archive

---
 web_scrap.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/web_scrap.py b/web_scrap.py
index dff226d..5571050 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -14,3 +14,18 @@ if page.status_code == 200:
         href = anchor.get('href', '/')
         if href != "#" and href != "http://www.clarissariviere.com/":
             print(href)
+
+
+for i in range(1,100):
+    paging = i * 10
+    page = requests.get("https://www.clarissariviere.com/archives/p{0}-10.html".format(i))
+    soup = BeautifulSoup(page.text, 'html.parser')
+    if page.status_code == 200:
+        h2 = soup.find_all("h2")
+        for title in h2:
+            print(title.find_all("a")[0].get("href", "/"))
+
+
+
+
+

From e42ffd98ae4bc5ca0e3360f944d024a133ed0438 Mon Sep 17 00:00:00 2001
From: Valentin CZERYBA
Date: Thu, 2 Mar 2023 23:28:04 +0100
Subject: [PATCH 04/11] scrap href all page from gouter

---
 web_scrap.py | 42 +++++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/web_scrap.py b/web_scrap.py
index 5571050..8ea6977 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -5,25 +5,45 @@
 from bs4 import BeautifulSoup
 import requests
 
-page = requests.get("https://www.clarissariviere.com")
+URL = "www.clarissariviere.com"
+page = requests.get("https://{0}".format(URL))
+
+page_url = []
 
 if page.status_code == 200:
     soup = BeautifulSoup(page.text, 'html.parser')
     ul = soup.find_all("ul", id="listsmooth")
     for anchor in ul[0].find_all("a"):
         href = anchor.get('href', '/')
-        if href != "#" and href != "http://www.clarissariviere.com/":
-            print(href)
+        if href != "#":
+            page_url.append(href)
 
-
-for i in range(1,100):
-    paging = i * 10
-    page = requests.get("https://www.clarissariviere.com/archives/p{0}-10.html".format(i))
-    soup = BeautifulSoup(page.text, 'html.parser')
+for i in page_url:
+    page = requests.get(i)
     if page.status_code == 200:
-        h2 = soup.find_all("h2")
-        for title in h2:
-            print(title.find_all("a")[0].get("href", "/"))
+        print("page : {0}".format(i))
+        soup = BeautifulSoup(page.text, 'html.parser')
+        class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline")
+        if len(class_div) > 0:
+            pagingfirstline = class_div[0].find_all("a")
+            if len(pagingfirstline) > 1:
+                lastpage = pagingfirstline[len(pagingfirstline)-1].get("href", "/")
+                element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1]
+                number_page = element_lastpage.split("-")[0].split("p")[1]
+                number_lastpage = int(number_page) / 10
+                for j in range(1,int(number_lastpage)):
+                    paging = j * 10
+                    categorie = i.split("/")
+                    url_paging = "https://{0}/archives/p{1}-10.html".format(URL, paging)
+                    if len(categorie) != 4:
+                        url_paging = "https://{0}/archives/{1}/p{2}-10.html".format(URL, categorie[4], paging)
+                    print(url_paging)
+                    page = requests.get(url_paging)
+                    if page.status_code == 200:
+                        soup = BeautifulSoup(page.text, 'html.parser')
+                        h2 = soup.find_all("h2")
+                        for title in h2:
+                            print(title.find_all("a")[0].get("href", "/"))
 
 
 
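
The paging arithmetic added in PATCH 04 is hard to read inside the diff. A minimal sketch of the same calculation, not part of the patch series, assuming a canalblog-style last-page link such as /archives/p150-10.html (the "150" is a hypothetical post count):

    # Illustrative rerun of the PATCH 04 paging derivation with an assumed last-page link.
    lastpage = "https://www.clarissariviere.com/archives/p150-10.html"
    element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1]   # "p150-10.html"
    number_page = element_lastpage.split("-")[0].split("p")[1]           # "150"
    number_lastpage = int(number_page) / 10                              # 15.0 archive pages
    print(["/archives/p{0}-10.html".format(j * 10) for j in range(1, int(number_lastpage))])
    # ['/archives/p10-10.html', ..., '/archives/p140-10.html'] -- range() stops before the final page
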
From d21af4f60aff64a6618408db8e1c2a711e1dadae Mon Sep 17 00:00:00 2001
From: Valentin CZERYBA
Date: Fri, 3 Mar 2023 20:03:48 +0100
Subject: [PATCH 05/11] add array unique of webpage

---
 web_scrap.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/web_scrap.py b/web_scrap.py
index 8ea6977..a96d90f 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -18,6 +18,7 @@ if page.status_code == 200:
         if href != "#":
             page_url.append(href)
 
+webpage = []
 for i in page_url:
     page = requests.get(i)
     if page.status_code == 200:
@@ -43,7 +44,11 @@ for i in page_url:
                         soup = BeautifulSoup(page.text, 'html.parser')
                         h2 = soup.find_all("h2")
                         for title in h2:
-                            print(title.find_all("a")[0].get("href", "/"))
+                            href = title.find_all("a")[0].get("href", "/")
+                            if href not in webpage:
+                                webpage.append(href)
+
+print(webpage)
 
 
 

From 3c76cab9a7566c46021846b7351bed829f1f5915 Mon Sep 17 00:00:00 2001
From: Valentin CZERYBA
Date: Sat, 4 Mar 2023 16:12:42 +0100
Subject: [PATCH 06/11] add urlparse

---
 web_scrap.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/web_scrap.py b/web_scrap.py
index a96d90f..caf2537 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -3,6 +3,7 @@
 # Python 3
 # Extraction des liens d'une page web
 from bs4 import BeautifulSoup
+from urllib.parse import urlparse
 import requests
 
 URL = "www.clarissariviere.com"
@@ -21,6 +22,8 @@ if page.status_code == 200:
 webpage = []
 for i in page_url:
     page = requests.get(i)
+    o = urlparse(i)
+    print(o.path)
     if page.status_code == 200:
         print("page : {0}".format(i))
         soup = BeautifulSoup(page.text, 'html.parser')
@@ -34,10 +37,10 @@ for i in page_url:
                 number_lastpage = int(number_page) / 10
                 for j in range(1,int(number_lastpage)):
                     paging = j * 10
-                    categorie = i.split("/")
+                    categorie = urlparse(i).path.split("/")
                     url_paging = "https://{0}/archives/p{1}-10.html".format(URL, paging)
-                    if len(categorie) != 4:
-                        url_paging = "https://{0}/archives/{1}/p{2}-10.html".format(URL, categorie[4], paging)
+                    if len(categorie) > 2:
+                        url_paging = "https://{0}/archives/{1}/p{2}-10.html".format(URL, categorie[2], paging)
                     print(url_paging)
                     page = requests.get(url_paging)
                     if page.status_code == 200:
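
PATCH 06 switches from splitting the whole URL to splitting only its path, which is why the index moves from categorie[4] to categorie[2]. A short comparison, assuming a category link of the form shown below ("recettes" is a hypothetical category name):

    from urllib.parse import urlparse

    link = "http://www.clarissariviere.com/archives/recettes/index.html"
    print(link.split("/"))
    # ['http:', '', 'www.clarissariviere.com', 'archives', 'recettes', 'index.html'] -> category at index 4
    print(urlparse(link).path.split("/"))
    # ['', 'archives', 'recettes', 'index.html'] -> category at index 2, hence the new len(categorie) > 2 test
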
From 6794f77df2da4e63dcabc8b75071c151286c38ff Mon Sep 17 00:00:00 2001
From: Valentin CZERYBA
Date: Sat, 4 Mar 2023 18:35:06 +0100
Subject: [PATCH 07/11] create dir for every path

---
 web_scrap.py | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/web_scrap.py b/web_scrap.py
index caf2537..c381606 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -1,10 +1,23 @@
 #!/usr/bin/python3
-
 # Python 3
 # Extraction des liens d'une page web
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
-import requests
+import requests, os
+
+def mkdir_path(path_dir):
+    if not os.path.exists(path_dir):
+        makedir = []
+        pathh = path_dir.split("/")
+        for i in pathh:
+            makedir.append(i)
+            repath = "/".join(makedir)
+            if not os.path.exists(repath):
+                os.mkdir(repath)
+
+BACKUP_DIR = "backup"
+
+mkdir_path(BACKUP_DIR)
 
 URL = "www.clarissariviere.com"
 page = requests.get("https://{0}".format(URL))
@@ -22,8 +35,6 @@ if page.status_code == 200:
 webpage = []
 for i in page_url:
     page = requests.get(i)
-    o = urlparse(i)
-    print(o.path)
     if page.status_code == 200:
         print("page : {0}".format(i))
         soup = BeautifulSoup(page.text, 'html.parser')
@@ -51,9 +62,9 @@ for i in page_url:
                             if href not in webpage:
                                 webpage.append(href)
 
-print(webpage)
-
-
-
-
-
+for i in webpage:
+    o = urlparse(i)
+    path_web = o.path.split("/")
+    path_web.pop(len(path_web)-1)
+    dir_page_web = "/".join(path_web)
+    mkdir_path("{0}/{1}".format(BACKUP_DIR, dir_page_web))
\ No newline at end of file

From 4de811c607b86f9a7a09a847714cefd0ff6dd684 Mon Sep 17 00:00:00 2001
From: Valentin CZERYBA
Date: Sat, 4 Mar 2023 18:45:32 +0100
Subject: [PATCH 08/11] fix placement variable

---
 web_scrap.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/web_scrap.py b/web_scrap.py
index c381606..c43c2a5 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -5,6 +5,10 @@ from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 import requests, os
 
+BACKUP_DIR = "backup"
+URL = "www.clarissariviere.com"
+
+
 def mkdir_path(path_dir):
     if not os.path.exists(path_dir):
         makedir = []
@@ -15,11 +19,8 @@ def mkdir_path(path_dir):
             if not os.path.exists(repath):
                 os.mkdir(repath)
 
-BACKUP_DIR = "backup"
-
 mkdir_path(BACKUP_DIR)
 
-URL = "www.clarissariviere.com"
 page = requests.get("https://{0}".format(URL))
 
 page_url = []
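
The mkdir_path() helper introduced in PATCH 07 creates each path component of the backup directory in turn. For reference, a standard-library shortcut with the same effect would be (a sketch, not part of the patches):

    import os

    def mkdir_path(path_dir):
        # os.makedirs creates every missing intermediate directory in one call.
        os.makedirs(path_dir, exist_ok=True)
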
From a3aceccba70fca34c2a3bd42bc5b262a172f5f98 Mon Sep 17 00:00:00 2001
From: Valentin CZERYBA
Date: Sun, 5 Mar 2023 20:12:58 +0100
Subject: [PATCH 09/11] create function for every task

---
 web_scrap.py | 106 ++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 62 insertions(+), 44 deletions(-)

diff --git a/web_scrap.py b/web_scrap.py
index c43c2a5..18e38c5 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -19,53 +19,71 @@ def mkdir_path(path_dir):
             if not os.path.exists(repath):
                 os.mkdir(repath)
 
-mkdir_path(BACKUP_DIR)
 
-page = requests.get("https://{0}".format(URL))
 
-page_url = []
 
-if page.status_code == 200:
-    soup = BeautifulSoup(page.text, 'html.parser')
-    ul = soup.find_all("ul", id="listsmooth")
-    for anchor in ul[0].find_all("a"):
-        href = anchor.get('href', '/')
-        if href != "#":
-            page_url.append(href)
-
-webpage = []
-for i in page_url:
-    page = requests.get(i)
+def getUrlPage(url):
+    print(url)
+    page = requests.get(url)
+    page_url = []
     if page.status_code == 200:
-        print("page : {0}".format(i))
         soup = BeautifulSoup(page.text, 'html.parser')
-        class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline")
-        if len(class_div) > 0:
-            pagingfirstline = class_div[0].find_all("a")
-            if len(pagingfirstline) > 1:
-                lastpage = pagingfirstline[len(pagingfirstline)-1].get("href", "/")
-                element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1]
-                number_page = element_lastpage.split("-")[0].split("p")[1]
-                number_lastpage = int(number_page) / 10
-                for j in range(1,int(number_lastpage)):
-                    paging = j * 10
-                    categorie = urlparse(i).path.split("/")
-                    url_paging = "https://{0}/archives/p{1}-10.html".format(URL, paging)
-                    if len(categorie) > 2:
-                        url_paging = "https://{0}/archives/{1}/p{2}-10.html".format(URL, categorie[2], paging)
-                    print(url_paging)
-                    page = requests.get(url_paging)
-                    if page.status_code == 200:
-                        soup = BeautifulSoup(page.text, 'html.parser')
-                        h2 = soup.find_all("h2")
-                        for title in h2:
-                            href = title.find_all("a")[0].get("href", "/")
-                            if href not in webpage:
-                                webpage.append(href)
+        ul = soup.find_all("ul", id="listsmooth")
+        for anchor in ul[0].find_all("a"):
+            href = anchor.get('href', '/')
+            if href != "#":
+                page_url.append(href)
 
-for i in webpage:
-    o = urlparse(i)
-    path_web = o.path.split("/")
-    path_web.pop(len(path_web)-1)
-    dir_page_web = "/".join(path_web)
-    mkdir_path("{0}/{1}".format(BACKUP_DIR, dir_page_web))
\ No newline at end of file
+    webpage = []
+    for i in page_url:
+        page = requests.get(i)
+        if page.status_code == 200:
+            print("page : {0}".format(i))
+            if i not in webpage:
+                webpage.append(i)
+            soup = BeautifulSoup(page.text, 'html.parser')
+            class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline")
+            if len(class_div) > 0:
+                pagingfirstline = class_div[0].find_all("a")
+                if len(pagingfirstline) > 1:
+                    lastpage = pagingfirstline[len(pagingfirstline)-1].get("href", "/")
+                    element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1]
+                    number_page = element_lastpage.split("-")[0].split("p")[1]
+                    number_lastpage = int(number_page) / 10
+                    for j in range(1,int(number_lastpage)):
+                        paging = j * 10
+                        categorie = urlparse(i).path.split("/")
+                        url_paging = "{0}/archives/p{1}-10.html".format(url, paging)
+                        if len(categorie) > 2:
+                            url_paging = "{0}/archives/{1}/p{2}-10.html".format(url, categorie[2], paging)
+                        print(url_paging)
+                        if url_paging not in webpage:
+                            webpage.append(url_paging)
+                        page = requests.get(url_paging)
+                        if page.status_code == 200:
+                            soup = BeautifulSoup(page.text, 'html.parser')
+                            h2 = soup.find_all("h2")
+                            for title in h2:
+                                href = title.find_all("a")[0].get("href", "/")
+                                if href not in webpage:
+                                    o = urlparse(href)
+                                    o = o._replace(scheme="https").geturl()
+                                    webpage.append(o)
+    return webpage
+
+
+def downloadPage(url):
+    o = urlparse(url)
+    o = o._replace(scheme="https")
+    o = o._replace(fragment="")
+    webpage = getUrlPage(o.geturl().replace(":///", "://"))
+    for i in webpage:
+        o = urlparse(i)
+        path_web = o.path.split("/")
+        path_web.pop(len(path_web)-1)
+        dir_page_web = "/".join(path_web)
+        mkdir_path("{0}/{1}".format(BACKUP_DIR, dir_page_web))
+
+
+if __name__ == '__main__':
+    downloadPage(URL)
\ No newline at end of file
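
PATCH 09 starts normalising every collected href to https through urllib.parse before storing it. A small illustration of that call chain (the archive URL below is a hypothetical example, not taken from the site):

    from urllib.parse import urlparse

    o = urlparse("http://www.clarissariviere.com/archives/2023/02/index.html")
    print(o._replace(scheme="https").geturl())
    # https://www.clarissariviere.com/archives/2023/02/index.html
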
From c7dc2d626f5995ad8470b51a4579c861d3c6323e Mon Sep 17 00:00:00 2001
From: Valentin CZERYBA
Date: Sun, 5 Mar 2023 21:44:30 +0100
Subject: [PATCH 10/11] Download file html

---
 web_scrap.py | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/web_scrap.py b/web_scrap.py
index 18e38c5..fb8fc32 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -1,13 +1,7 @@
 #!/usr/bin/python3
-# Python 3
-# Extraction des liens d'une page web
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
-import requests, os
-
-BACKUP_DIR = "backup"
-URL = "www.clarissariviere.com"
-
+import requests, os, argparse
 
 def mkdir_path(path_dir):
     if not os.path.exists(path_dir):
         makedir = []
@@ -20,10 +14,7 @@ def mkdir_path(path_dir):
                 os.mkdir(repath)
 
-
-
 
 def getUrlPage(url):
-    print(url)
     page = requests.get(url)
     page_url = []
     if page.status_code == 200:
@@ -69,21 +60,35 @@ def getUrlPage(url):
                                     o = urlparse(href)
                                     o = o._replace(scheme="https").geturl()
                                     webpage.append(o)
-    return webpage
+    return webpage
 
 
-def downloadPage(url):
+def downloadPage(url, backup_dir):
     o = urlparse(url)
     o = o._replace(scheme="https")
-    o = o._replace(fragment="")
     webpage = getUrlPage(o.geturl().replace(":///", "://"))
-    for i in webpage:
-        o = urlparse(i)
+    for i in range(0, len(webpage)):
+        o = urlparse(webpage[i])
         path_web = o.path.split("/")
+        filePageWeb = path_web[len(path_web)-1]
         path_web.pop(len(path_web)-1)
         dir_page_web = "/".join(path_web)
-        mkdir_path("{0}/{1}".format(BACKUP_DIR, dir_page_web))
+        mkdir_path("{0}/{1}".format(backup_dir, dir_page_web))
+        r = requests.get(webpage[i])
+        if r.status_code == 200:
+            fileDownload = "{0}/index.html".format(backup_dir)
+            if len(dir_page_web) > 0 and len(filePageWeb) > 0:
+                fileDownload = "{0}/{1}/{2}".format(backup_dir, dir_page_web, filePageWeb)
+            print("{0}/{1} : {2}".format(i, len(webpage), fileDownload))
+            open(fileDownload, "wb").write(r.content)
 
 
 if __name__ == '__main__':
-    downloadPage(URL)
\ No newline at end of file
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--url", help="canblog URL to be scraping", required=True)
+    parser.add_argument("--dir",
+                        default="backup",
+                        help="backup file path")
+    parser.add_argument("--verbosity", help="Verbosity", action="store_false")
+    args = parser.parse_args()
+    downloadPage(args.url, args.dir)
\ No newline at end of file
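
PATCH 10 writes each page with open(fileDownload, "wb").write(r.content), which leaves closing the file handle to the garbage collector. An equivalent write using a context manager would look like this (illustrative only; the target file name here is hypothetical):

    import requests

    r = requests.get("https://www.clarissariviere.com/index.html")
    if r.status_code == 200:
        with open("index.html", "wb") as f:   # closed even if the write raises
            f.write(r.content)
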
"{0}/archives/p{1}-10.html".format(url, paging) if len(categorie) > 2: url_paging = "{0}/archives/{1}/p{2}-10.html".format(url, categorie[2], paging) - print(url_paging) + logger.info(url_paging) if url_paging not in webpage: webpage.append(url_paging) page = requests.get(url_paging) @@ -63,23 +64,23 @@ def getUrlPage(url): return webpage -def downloadPage(url, backup_dir): +def downloadPage(url, backup_dir, logger): o = urlparse(url) o = o._replace(scheme="https") - webpage = getUrlPage(o.geturl().replace(":///", "://")) + webpage = getUrlPage(o.geturl().replace(":///", "://"), logger) for i in range(0, len(webpage)): o = urlparse(webpage[i]) path_web = o.path.split("/") filePageWeb = path_web[len(path_web)-1] path_web.pop(len(path_web)-1) dir_page_web = "/".join(path_web) - mkdir_path("{0}/{1}".format(backup_dir, dir_page_web)) + mkdir_path("{0}/{1}".format(backup_dir, dir_page_web), logger) r = requests.get(webpage[i]) if r.status_code == 200: fileDownload = "{0}/index.html".format(backup_dir) if len(dir_page_web) > 0 and len(filePageWeb) > 0: fileDownload = "{0}/{1}/{2}".format(backup_dir, dir_page_web, filePageWeb) - print("{0}/{1} : {2}".format(i, len(webpage), fileDownload)) + logger.info("{0}/{1} : {2}".format(i, len(webpage), fileDownload)) open(fileDownload, "wb").write(r.content) @@ -89,6 +90,20 @@ if __name__ == '__main__': parser.add_argument("--dir", default="backup", help="backup file path") - parser.add_argument("--verbosity", help="Verbosity", action="store_false") + parser.add_argument("--debug", help="Verbosity", action="store_true") args = parser.parse_args() - downloadPage(args.url, args.dir) \ No newline at end of file + logger = logging.getLogger('web_scrap') + ch = logging.StreamHandler() + + if args.debug is not None: + logger.setLevel(logging.DEBUG) + ch.setLevel(logging.DEBUG) + else: + logger.setLevel(logging.INFO) + ch.setLevel(logging.INFO) + + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + ch.setFormatter(formatter) + logger.addHandler(ch) + + downloadPage(args.url, args.dir, logger) \ No newline at end of file