def getImg(webpage, logger, timeout=30):
    """Collect the unique ``src`` values of every ``<img>`` tag on the given pages.

    Each URL in *webpage* is fetched with ``requests.get``. Pages that cannot
    be fetched, or that answer with a status other than 200, are skipped so
    that one bad page does not abort the whole scan.

    Args:
        webpage: iterable of page URLs to scan.
        logger: logger used to report the pages scanned, the images found
            and the pages that could not be fetched.
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``src`` attribute values in first-seen order, without
        duplicates. Values are returned exactly as written in the HTML.
    """
    # NOTE(review): relative ``src`` values are not resolved against the page
    # URL here -- confirm that the downloader handles them.
    page_img = []
    # Mirror of page_img used only for constant-time duplicate checks.
    seen = set()

    for page_url in webpage:
        try:
            page = requests.get(page_url, timeout=timeout)
        except requests.RequestException as err:
            logger.error(
                "unable to fetch page: {0} : {1}".format(page_url, err))
            continue

        if page.status_code != 200:
            continue

        soup = BeautifulSoup(page.text, 'html.parser')
        logger.info("image from page: {0} : ".format(page_url))

        for img_tag in soup.find_all("img"):
            src = img_tag.get("src")
            # Skip tags without a usable source: missing, empty, or the
            # bare "/" placeholder.
            if not src or src == "/" or src in seen:
                continue
            logger.info("image: {0} : ".format(src))
            seen.add(src)
            page_img.append(src)

    return page_img