From 559f5f1e8323cc36d186c8c6dcb2e0c0332b1ca8 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Tue, 7 Mar 2023 21:54:18 +0100 Subject: [PATCH 1/7] re-organisation fonction et renommage --- web_scrap.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/web_scrap.py b/web_scrap.py index d08dfbd..949a884 100644 --- a/web_scrap.py +++ b/web_scrap.py @@ -3,7 +3,7 @@ from bs4 import BeautifulSoup from urllib.parse import urlparse import requests, os, argparse, logging -def mkdir_path(path_dir, logger): +def mkdirPath(path_dir, logger): if not os.path.exists(path_dir): makedir = [] pathh = path_dir.split("/") @@ -64,22 +64,20 @@ def getUrlPage(url, logger): return webpage -def downloadPage(url, backup_dir, logger): - o = urlparse(url) - o = o._replace(scheme="https") - webpage = getUrlPage(o.geturl().replace(":///", "://"), logger) +def downloadPageHTML(webpage, backup_dir, logger): + for i in range(0, len(webpage)): o = urlparse(webpage[i]) path_web = o.path.split("/") filePageWeb = path_web[len(path_web)-1] path_web.pop(len(path_web)-1) dir_page_web = "/".join(path_web) - mkdir_path("{0}/{1}".format(backup_dir, dir_page_web), logger) + mkdirPath("{0}/{1}".format(backup_dir, dir_page_web), logger) r = requests.get(webpage[i]) if r.status_code == 200: fileDownload = "{0}/index.html".format(backup_dir) if len(dir_page_web) > 0 and len(filePageWeb) > 0: - fileDownload = "{0}/{1}/{2}".format(backup_dir, dir_page_web, filePageWeb) + fileDownload = "{0}{1}/{2}".format(backup_dir, dir_page_web, filePageWeb) logger.info("{0}/{1} : {2}".format(i, len(webpage), fileDownload)) open(fileDownload, "wb").write(r.content) @@ -95,7 +93,7 @@ if __name__ == '__main__': logger = logging.getLogger('web_scrap') ch = logging.StreamHandler() - if args.debug is not None: + if args.debug is True: logger.setLevel(logging.DEBUG) ch.setLevel(logging.DEBUG) else: @@ -106,4 +104,8 @@ if __name__ == '__main__': ch.setFormatter(formatter) logger.addHandler(ch) - downloadPage(args.url, args.dir, logger) \ No newline at end of file + o = urlparse(args.url) + o = o._replace(scheme="https") + webpage = getUrlPage(o.geturl().replace(":///", "://"), logger) + + downloadPageHTML(webpage, args.dir, logger) \ No newline at end of file From 1a67ab7dbf42382d4b900d27c20164f508addf41 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Tue, 7 Mar 2023 22:42:05 +0100 Subject: [PATCH 2/7] download script js --- web_scrap.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/web_scrap.py b/web_scrap.py index 949a884..9eada68 100644 --- a/web_scrap.py +++ b/web_scrap.py @@ -15,6 +15,25 @@ def mkdirPath(path_dir, logger): os.mkdir(repath) +def getScriptCss(url, js, css, logger): + page = requests.get(url) + page_url = [] + if page.status_code == 200: + soup = BeautifulSoup(page.text, 'html.parser') + if js is True: + script = soup.find_all("script") + for anchor in script: + src = anchor.get("src", "/") + if src != "/": + u = urlparse(url) + o = urlparse(src) + if o.netloc == "": + o = o._replace(netloc=u.netloc) + o = o._replace(scheme=u.scheme) + page_url.append(o.geturl()) + + return page_url + def getUrlPage(url, logger): page = requests.get(url) page_url = [] @@ -64,7 +83,7 @@ def getUrlPage(url, logger): return webpage -def downloadPageHTML(webpage, backup_dir, logger): +def downloadPage(webpage, backup_dir, logger): for i in range(0, len(webpage)): o = urlparse(webpage[i]) @@ -106,6 +125,9 @@ if __name__ == '__main__': o = urlparse(args.url) o = o._replace(scheme="https") - webpage = getUrlPage(o.geturl().replace(":///", "://"), logger) + url = o.geturl().replace(":///", "://") + script = getScriptCss(url, True, False, logger) + logger.info(script) + #webpage = getUrlPage(url, logger) - downloadPageHTML(webpage, args.dir, logger) \ No newline at end of file + downloadPage(script, "{0}/{1}".format(args.dir, "dists/js"), logger) \ No newline at end of file From 06599d99faf7e4cc960b054a4005c2af8b1ff8c1 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Tue, 7 Mar 2023 22:50:40 +0100 Subject: [PATCH 3/7] download css --- web_scrap.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/web_scrap.py b/web_scrap.py index 9eada68..e03fb1e 100644 --- a/web_scrap.py +++ b/web_scrap.py @@ -31,6 +31,20 @@ def getScriptCss(url, js, css, logger): o = o._replace(netloc=u.netloc) o = o._replace(scheme=u.scheme) page_url.append(o.geturl()) + if css is True: + link = soup.find_all("link") + for anchor in link: + rel = anchor.get("rel") + if rel[0] == "stylesheet": + href = anchor.get("href", "/") + if href != "/": + u = urlparse(url) + o = urlparse(href) + if o.netloc == "": + o = o._replace(netloc=u.netloc) + o = o._replace(scheme=u.scheme) + page_url.append(o.geturl()) + return page_url @@ -97,7 +111,7 @@ def downloadPage(webpage, backup_dir, logger): fileDownload = "{0}/index.html".format(backup_dir) if len(dir_page_web) > 0 and len(filePageWeb) > 0: fileDownload = "{0}{1}/{2}".format(backup_dir, dir_page_web, filePageWeb) - logger.info("{0}/{1} : {2}".format(i, len(webpage), fileDownload)) + logger.info("{0}/{1} : {2}".format(i+1, len(webpage), fileDownload)) open(fileDownload, "wb").write(r.content) @@ -127,7 +141,9 @@ if __name__ == '__main__': o = o._replace(scheme="https") url = o.geturl().replace(":///", "://") script = getScriptCss(url, True, False, logger) - logger.info(script) + downloadPage(script, "{0}/{1}".format(args.dir, "dists/js"), logger) + css = getScriptCss(url, False, True, logger) + downloadPage(css, "{0}/{1}".format(args.dir, "dists/css"), logger) #webpage = getUrlPage(url, logger) - downloadPage(script, "{0}/{1}".format(args.dir, "dists/js"), logger) \ No newline at end of file + #downloadPage(script, "{0}/{1}".format(args.dir, "dists/js"), logger) \ No newline at end of file From 896cfa0d52817d983205fd73e7730845066485e0 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Tue, 7 Mar 2023 22:53:33 +0100 Subject: [PATCH 4/7] remove comment useless --- web_scrap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/web_scrap.py b/web_scrap.py index e03fb1e..e3b57a3 100644 --- a/web_scrap.py +++ b/web_scrap.py @@ -144,6 +144,6 @@ if __name__ == '__main__': downloadPage(script, "{0}/{1}".format(args.dir, "dists/js"), logger) css = getScriptCss(url, False, True, logger) downloadPage(css, "{0}/{1}".format(args.dir, "dists/css"), logger) - #webpage = getUrlPage(url, logger) + webpage = getUrlPage(url, logger) - #downloadPage(script, "{0}/{1}".format(args.dir, "dists/js"), logger) \ No newline at end of file + downloadPage(script, args.dir, logger) \ No newline at end of file From 21d24d638ded4e558435f816b640b34895386b8a Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Wed, 8 Mar 2023 22:01:11 +0100 Subject: [PATCH 5/7] add argument --- web_scrap.py | 46 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/web_scrap.py b/web_scrap.py index e3b57a3..0593fb5 100644 --- a/web_scrap.py +++ b/web_scrap.py @@ -48,6 +48,24 @@ def getScriptCss(url, js, css, logger): return page_url +def getImg(webpage, logger): + page_img = [] + for i in webpage: + page = requests.get(i) + if page.status_code == 200: + soup = BeautifulSoup(page.text, 'html.parser') + img = soup.find_all("img") + logger.info("image from page: {0} : ".format(i)) + for anchor in img: + src = anchor.get("src", "/") + if src != "/": + if src not in page_img: + logger.info("image: {0} : ".format(src)) + page_img.append(src) + + + return page_img + def getUrlPage(url, logger): page = requests.get(url) page_url = [] @@ -122,6 +140,11 @@ if __name__ == '__main__': default="backup", help="backup file path") parser.add_argument("--debug", help="Verbosity", action="store_true") + parser.add_argument("--no-css", help="No CSS", dest="css", action="store_true") + parser.add_argument("--no-js", help="No JS", dest="js", action="store_true") + parser.add_argument("--no-img", help="No img", dest="img", action="store_true") + parser.add_argument("--no-html", help="No HTML", dest="html", action="store_true") + args = parser.parse_args() logger = logging.getLogger('web_scrap') ch = logging.StreamHandler() @@ -140,10 +163,21 @@ if __name__ == '__main__': o = urlparse(args.url) o = o._replace(scheme="https") url = o.geturl().replace(":///", "://") - script = getScriptCss(url, True, False, logger) - downloadPage(script, "{0}/{1}".format(args.dir, "dists/js"), logger) - css = getScriptCss(url, False, True, logger) - downloadPage(css, "{0}/{1}".format(args.dir, "dists/css"), logger) - webpage = getUrlPage(url, logger) + if args.js is False: + script = getScriptCss(url, True, False, logger) + downloadPage(script, "{0}/{1}".format(args.dir, "dists/js"), logger) + + if args.css is False: + css = getScriptCss(url, False, True, logger) + downloadPage(css, "{0}/{1}".format(args.dir, "dists/css"), logger) + + if args.html is False or args.img is False: + webpage = getUrlPage(url, logger) + if args.html is False: + downloadPage(webpage, args.dir, logger) + + if args.img is False: + page_src = getImg(webpage, logger) + downloadPage(page_src, "{0}/{1}".format(args.dir, "img"), logger) - downloadPage(script, args.dir, logger) \ No newline at end of file + # \ No newline at end of file From 77e61ef571344f5a4cb1b6a8250afbfa414400a6 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Wed, 8 Mar 2023 22:05:25 +0100 Subject: [PATCH 6/7] fix path --- web_scrap.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/web_scrap.py b/web_scrap.py index 0593fb5..2465d90 100644 --- a/web_scrap.py +++ b/web_scrap.py @@ -123,12 +123,12 @@ def downloadPage(webpage, backup_dir, logger): filePageWeb = path_web[len(path_web)-1] path_web.pop(len(path_web)-1) dir_page_web = "/".join(path_web) - mkdirPath("{0}/{1}".format(backup_dir, dir_page_web), logger) + mkdirPath("{0}/{1}/{2}".format(backup_dir, o.netloc, dir_page_web), logger) r = requests.get(webpage[i]) if r.status_code == 200: - fileDownload = "{0}/index.html".format(backup_dir) + fileDownload = "{0}/{1}/index.html".format(backup_dir, o.netloc) if len(dir_page_web) > 0 and len(filePageWeb) > 0: - fileDownload = "{0}{1}/{2}".format(backup_dir, dir_page_web, filePageWeb) + fileDownload = "{0}/{1}{2}/{3}".format(backup_dir, o.netloc, dir_page_web, filePageWeb) logger.info("{0}/{1} : {2}".format(i+1, len(webpage), fileDownload)) open(fileDownload, "wb").write(r.content) From 4d073e0254800400723158c237224d9a5c30a564 Mon Sep 17 00:00:00 2001 From: Valentin CZERYBA Date: Wed, 8 Mar 2023 22:41:35 +0100 Subject: [PATCH 7/7] fix path with url --- web_scrap.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/web_scrap.py b/web_scrap.py index 2465d90..d11f7f3 100644 --- a/web_scrap.py +++ b/web_scrap.py @@ -165,11 +165,11 @@ if __name__ == '__main__': url = o.geturl().replace(":///", "://") if args.js is False: script = getScriptCss(url, True, False, logger) - downloadPage(script, "{0}/{1}".format(args.dir, "dists/js"), logger) + downloadPage(script, "{0}/{1}/{2}".format(args.dir, o.path, "dists/js"), logger) if args.css is False: css = getScriptCss(url, False, True, logger) - downloadPage(css, "{0}/{1}".format(args.dir, "dists/css"), logger) + downloadPage(css, "{0}/{1}/{2}".format(args.dir, o.path, "dists/css"), logger) if args.html is False or args.img is False: webpage = getUrlPage(url, logger) @@ -178,6 +178,4 @@ if __name__ == '__main__': if args.img is False: page_src = getImg(webpage, logger) - downloadPage(page_src, "{0}/{1}".format(args.dir, "img"), logger) - - # \ No newline at end of file + downloadPage(page_src, "{0}/{1}/{2}".format(args.dir, o.path, "img"), logger) \ No newline at end of file