diff --git a/web_scrap.py b/web_scrap.py index 9eada68..e03fb1e 100644 --- a/web_scrap.py +++ b/web_scrap.py @@ -31,6 +31,20 @@ def getScriptCss(url, js, css, logger): o = o._replace(netloc=u.netloc) o = o._replace(scheme=u.scheme) page_url.append(o.geturl()) + if css is True: + link = soup.find_all("link") + for anchor in link: + rel = anchor.get("rel") + if rel[0] == "stylesheet": + href = anchor.get("href", "/") + if href != "/": + u = urlparse(url) + o = urlparse(href) + if o.netloc == "": + o = o._replace(netloc=u.netloc) + o = o._replace(scheme=u.scheme) + page_url.append(o.geturl()) + return page_url @@ -97,7 +111,7 @@ def downloadPage(webpage, backup_dir, logger): fileDownload = "{0}/index.html".format(backup_dir) if len(dir_page_web) > 0 and len(filePageWeb) > 0: fileDownload = "{0}{1}/{2}".format(backup_dir, dir_page_web, filePageWeb) - logger.info("{0}/{1} : {2}".format(i, len(webpage), fileDownload)) + logger.info("{0}/{1} : {2}".format(i+1, len(webpage), fileDownload)) open(fileDownload, "wb").write(r.content) @@ -127,7 +141,9 @@ if __name__ == '__main__': o = o._replace(scheme="https") url = o.geturl().replace(":///", "://") script = getScriptCss(url, True, False, logger) - logger.info(script) + downloadPage(script, "{0}/{1}".format(args.dir, "dists/js"), logger) + css = getScriptCss(url, False, True, logger) + downloadPage(css, "{0}/{1}".format(args.dir, "dists/css"), logger) #webpage = getUrlPage(url, logger) - downloadPage(script, "{0}/{1}".format(args.dir, "dists/js"), logger) \ No newline at end of file + #downloadPage(script, "{0}/{1}".format(args.dir, "dists/js"), logger) \ No newline at end of file