diff --git a/web_scrap.py b/web_scrap.py
index 949a884..9eada68 100644
--- a/web_scrap.py
+++ b/web_scrap.py
@@ -15,6 +15,25 @@ def mkdirPath(path_dir, logger):
             os.mkdir(repath)
 
 
+def getScriptCss(url, js, css, logger):
+    page = requests.get(url)
+    page_url = []
+    if page.status_code == 200:
+        soup = BeautifulSoup(page.text, 'html.parser')
+        if js is True:
+            script = soup.find_all("script")
+            for anchor in script:
+                src = anchor.get("src", "/")
+                if src != "/":
+                    u = urlparse(url)
+                    o = urlparse(src)
+                    if o.netloc == "":
+                        o = o._replace(netloc=u.netloc)
+                        o = o._replace(scheme=u.scheme)
+                    page_url.append(o.geturl())
+
+    return page_url
+
 def getUrlPage(url, logger):
     page = requests.get(url)
     page_url = []
@@ -64,7 +83,7 @@ def getUrlPage(url, logger):
 
     return webpage
 
-def downloadPageHTML(webpage, backup_dir, logger):
+def downloadPage(webpage, backup_dir, logger):
     for i in range(0, len(webpage)):
 
         o = urlparse(webpage[i])
@@ -106,6 +125,9 @@ if __name__ == '__main__':
 
     o = urlparse(args.url)
    o = o._replace(scheme="https")
-    webpage = getUrlPage(o.geturl().replace(":///", "://"), logger)
+    url = o.geturl().replace(":///", "://")
+    script = getScriptCss(url, True, False, logger)
+    logger.info(script)
+    #webpage = getUrlPage(url, logger)
 
-    downloadPageHTML(webpage, args.dir, logger)
\ No newline at end of file
+    downloadPage(script, "{0}/{1}".format(args.dir, "dists/js"), logger)
\ No newline at end of file
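
The key step in the new `getScriptCss` function is the `netloc`/`scheme` replacement: when a `<script>` tag carries a relative `src`, the host and scheme of the scraped page are grafted onto it so the download step gets an absolute URL. A minimal sketch of that logic in isolation, using hypothetical URLs and a helper name (`resolve_src`) that is not part of the diff:

```python
from urllib.parse import urlparse

def resolve_src(page_url, src):
    # Mirror the resolution done inside getScriptCss: if the src has no
    # netloc (i.e. it is a relative path), borrow host and scheme from
    # the page being scraped; otherwise leave the absolute URL untouched.
    u = urlparse(page_url)
    o = urlparse(src)
    if o.netloc == "":
        o = o._replace(netloc=u.netloc)
        o = o._replace(scheme=u.scheme)
    return o.geturl()

print(resolve_src("https://example.com/blog/", "/theme/js/main.js"))
# https://example.com/theme/js/main.js
print(resolve_src("https://example.com/blog/", "https://cdn.example.net/lib.js"))
# https://cdn.example.net/lib.js
```

The resulting list of absolute script URLs is then handed to the renamed `downloadPage` function, which writes them under the `dists/js` subdirectory of the backup directory.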