diff --git a/web_scrap.py b/web_scrap.py
index d08dfbd..d11f7f3 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -3,7 +3,7 @@
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 import requests, os, argparse, logging
-def mkdir_path(path_dir, logger):
+def mkdirPath(path_dir, logger):
     if not os.path.exists(path_dir):
         makedir = []
         pathh = path_dir.split("/")
@@ -15,6 +15,57 @@ def mkdir_path(path_dir, logger):
                 os.mkdir(repath)
 
 
+def getScriptCss(url, js, css, logger):
+    page = requests.get(url)
+    page_url = []
+    if page.status_code == 200:
+        soup = BeautifulSoup(page.text, 'html.parser')
+        if js is True:
+            script = soup.find_all("script")
+            for anchor in script:
+                src = anchor.get("src", "/")
+                if src != "/":
+                    u = urlparse(url)
+                    o = urlparse(src)
+                    if o.netloc == "":
+                        o = o._replace(netloc=u.netloc)
+                        o = o._replace(scheme=u.scheme)
+                    page_url.append(o.geturl())
+        if css is True:
+            link = soup.find_all("link")
+            for anchor in link:
+                rel = anchor.get("rel")
+                if rel[0] == "stylesheet":
+                    href = anchor.get("href", "/")
+                    if href != "/":
+                        u = urlparse(url)
+                        o = urlparse(href)
+                        if o.netloc == "":
+                            o = o._replace(netloc=u.netloc)
+                            o = o._replace(scheme=u.scheme)
+                        page_url.append(o.geturl())
+
+
+    return page_url
+
+def getImg(webpage, logger):
+    page_img = []
+    for i in webpage:
+        page = requests.get(i)
+        if page.status_code == 200:
+            soup = BeautifulSoup(page.text, 'html.parser')
+            img = soup.find_all("img")
+            logger.info("image from page: {0} : ".format(i))
+            for anchor in img:
+                src = anchor.get("src", "/")
+                if src != "/":
+                    if src not in page_img:
+                        logger.info("image: {0} : ".format(src))
+                        page_img.append(src)
+
+
+    return page_img
+
 def getUrlPage(url, logger):
     page = requests.get(url)
     page_url = []
@@ -64,23 +115,21 @@ def getUrlPage(url, logger):
 
     return webpage
 
-def downloadPage(url, backup_dir, logger):
-    o = urlparse(url)
-    o = o._replace(scheme="https")
-    webpage = getUrlPage(o.geturl().replace(":///", "://"), logger)
+def downloadPage(webpage, backup_dir, logger):
+
     for i in range(0, len(webpage)):
         o = urlparse(webpage[i])
         path_web = o.path.split("/")
         filePageWeb = path_web[len(path_web)-1]
         path_web.pop(len(path_web)-1)
         dir_page_web = "/".join(path_web)
-        mkdir_path("{0}/{1}".format(backup_dir, dir_page_web), logger)
+        mkdirPath("{0}/{1}/{2}".format(backup_dir, o.netloc, dir_page_web), logger)
         r = requests.get(webpage[i])
         if r.status_code == 200:
-            fileDownload = "{0}/index.html".format(backup_dir)
+            fileDownload = "{0}/{1}/index.html".format(backup_dir, o.netloc)
             if len(dir_page_web) > 0 and len(filePageWeb) > 0:
-                fileDownload = "{0}/{1}/{2}".format(backup_dir, dir_page_web, filePageWeb)
-            logger.info("{0}/{1} : {2}".format(i, len(webpage), fileDownload))
+                fileDownload = "{0}/{1}{2}/{3}".format(backup_dir, o.netloc, dir_page_web, filePageWeb)
+            logger.info("{0}/{1} : {2}".format(i+1, len(webpage), fileDownload))
             open(fileDownload, "wb").write(r.content)
 
 
@@ -91,11 +140,16 @@ if __name__ == '__main__':
                         default="backup", help="backup file path")
     parser.add_argument("--debug", help="Verbosity", action="store_true")
 
+    parser.add_argument("--no-css", help="No CSS", dest="css", action="store_true")
+    parser.add_argument("--no-js", help="No JS", dest="js", action="store_true")
+    parser.add_argument("--no-img", help="No img", dest="img", action="store_true")
+    parser.add_argument("--no-html", help="No HTML", dest="html", action="store_true")
+
     args = parser.parse_args()
 
     logger = logging.getLogger('web_scrap')
     ch = logging.StreamHandler()
-    if args.debug is not None:
+    if args.debug is True:
         logger.setLevel(logging.DEBUG)
         ch.setLevel(logging.DEBUG)
     else:
@@ -106,4 +160,22 @@ if __name__ == '__main__':
     ch.setFormatter(formatter)
     logger.addHandler(ch)
 
-    downloadPage(args.url, args.dir, logger)
\ No newline at end of file
+    o = urlparse(args.url)
+    o = o._replace(scheme="https")
+    url = o.geturl().replace(":///", "://")
+    if args.js is False:
+        script = getScriptCss(url, True, False, logger)
+        downloadPage(script, "{0}/{1}/{2}".format(args.dir, o.path, "dists/js"), logger)
+
+    if args.css is False:
+        css = getScriptCss(url, False, True, logger)
+        downloadPage(css, "{0}/{1}/{2}".format(args.dir, o.path, "dists/css"), logger)
+
+    if args.html is False or args.img is False:
+        webpage = getUrlPage(url, logger)
+        if args.html is False:
+            downloadPage(webpage, args.dir, logger)
+
+        if args.img is False:
+            page_src = getImg(webpage, logger)
+            downloadPage(page_src, "{0}/{1}/{2}".format(args.dir, o.path, "img"), logger)
\ No newline at end of file
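
For reference, here is the relative-to-absolute URL normalization that getScriptCss applies to every src/href, extracted into a standalone sketch. The page and asset URLs are hypothetical, chosen only for illustration; urllib.parse.urljoin is shown as the standard-library equivalent, not as something the patch itself uses.

    from urllib.parse import urlparse, urljoin

    # Hypothetical inputs, for illustration only.
    page_url = "https://example.com/blog/index.html"
    asset_src = "/static/js/app.js"

    # Same technique as getScriptCss: when the asset URL has no netloc
    # of its own, borrow scheme and netloc from the page URL.
    u = urlparse(page_url)
    o = urlparse(asset_src)
    if o.netloc == "":
        o = o._replace(netloc=u.netloc, scheme=u.scheme)
    print(o.geturl())                    # https://example.com/static/js/app.js

    # Equivalent single call from the standard library.
    print(urljoin(page_url, asset_src))  # https://example.com/static/js/app.js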