92 Commits

SHA1 Message Date
ab3720fbbc fix directory in thread 2023-04-29 22:26:47 +02:00
7a1286c4e2 add thread for directory import 2023-04-28 23:37:13 +02:00
5a4bdbb420 add name thread in message logger 2023-04-28 23:14:57 +02:00
bf4c2480f8 import threading for directory WIP 2023-04-27 00:00:53 +02:00
a0b816fe18 add debug thread 2023-04-26 23:03:43 +02:00
08ff16527d fix thread in parallelism 2023-04-25 16:15:45 +02:00
0acd5067cb thread 50% 2023-04-25 00:34:25 +02:00
aaac2385a3 fix previous commit 2023-04-24 23:16:53 +02:00
88f258ffba Add parallelism 2023-04-24 23:15:29 +02:00
a39e2200bd add function 2023-04-22 00:07:54 +02:00
5a5658d955 Merge pull request 'parent-comment' (#8) from parent-comment into master
Reviewed-on: #8
2023-04-20 19:30:45 +00:00
4e6ae92217 add message error and debug for export 2023-04-20 20:53:50 +02:00
34d6cc39d2 add debug message for error request 2023-04-20 20:48:37 +02:00
c44ffc5a86 double comment 2023-04-20 00:08:56 +02:00
ca39826a11 fix comment parent 75% 2023-04-19 23:53:11 +02:00
f8d103ff61 fix add comment 2023-04-19 23:16:39 +02:00
1c252c9a14 replace post by delete 2023-04-19 22:21:15 +02:00
84cc204007 comment update/add in fixing 2023-04-18 22:01:44 +02:00
edb9442b1c add search tags and categories before create tags and categories 2023-04-18 21:50:36 +02:00
d64aed6240 update error message + add debug 2023-04-18 00:00:32 +02:00
a5e7cb89f7 add error status code 2023-04-17 23:44:09 +02:00
ae7cb1e4e0 remove exit useless 2023-04-16 21:26:48 +02:00
4cf301b216 parent comment 90% 2023-04-16 21:25:32 +02:00
581b6941a6 parent id 75% 2023-04-16 21:06:04 +02:00
bd8ac241c1 debug level comment 2023-04-16 19:32:00 +02:00
0e15e88f31 Get level comment 50% 2023-04-16 19:16:23 +02:00
b54785c455 add parent comment WIP 2023-04-14 23:10:07 +02:00
1600a17383 Merge pull request 'retries' (#7) from retries into master
Reviewed-on: #7
2023-04-13 20:16:35 +00:00
74e7f1d74b add try/except for request 2023-04-13 22:14:30 +02:00
225c7ecabb add backoff factor 2023-04-13 21:59:12 +02:00
1311ef2ff2 add retry 2023-04-13 21:54:35 +02:00
f5e82fe4c4 Merge pull request 'insert' (#6) from insert into master
Reviewed-on: #6
2023-04-11 21:29:53 +00:00
76d2771886 remove newline useless 2023-04-11 23:27:41 +02:00
335266e1ad update comment 2023-04-11 23:26:40 +02:00
a856311f04 add method for comment 2023-04-11 22:30:00 +02:00
7848968fa1 Organise class files in a folder 2023-04-11 22:15:36 +02:00
05a3a28c6f add serial for url 2023-04-10 16:36:49 +02:00
7c75116c5b add url list 2023-04-10 16:15:13 +02:00
48e77084e8 remove newline 2023-04-10 16:07:14 +02:00
aa5c8893ec loop for url 2023-04-10 16:02:40 +02:00
4ddc4a7cd3 rm web_scrap + add set url + add backup1 to gitignore 2023-04-10 15:41:14 +02:00
ed78f22f2e fix WPImport from URL 2023-04-10 11:05:32 +02:00
e74dfc2b73 add import from url 2023-04-10 00:00:01 +02:00
cd50e45493 fix WPExport 2023-04-09 23:49:10 +02:00
19c62f38d4 add exit 2023-04-09 22:50:41 +02:00
bba6cd1ca7 add export canalblog 2023-04-09 22:49:44 +02:00
9ed08ea964 add parameter parser 2023-04-09 21:45:51 +02:00
cd6b03b0ff add parameter parser 2023-04-09 21:17:49 +02:00
7e484fa308 add args command name 2023-04-08 23:43:06 +02:00
ebc6206ec9 add subparser export 2023-04-08 23:34:56 +02:00
b3f623cbd5 subparser import 2023-04-08 23:20:52 +02:00
8384dcb2b6 create class WPExport 2023-04-08 22:14:20 +02:00
2289066dd5 rename main file 2023-04-08 21:44:52 +02:00
481fc40929 separate class file for WPimport 2023-04-08 21:27:35 +02:00
6f7504e669 separate file class 2023-04-08 12:27:30 +02:00
d58ead52b2 replace print by logger 2023-04-08 12:17:43 +02:00
9ab33c169e add directory parameter 2023-04-07 22:55:27 +02:00
34115a3a7d recursive functions for directories 2023-04-07 22:38:34 +02:00
1f7e442d04 wip directory 2023-04-06 21:53:56 +02:00
5768b37cd1 add check for file existence 2023-04-06 20:59:11 +02:00
ba511bc6c4 add print for added image 2023-04-04 22:14:10 +02:00
665f1474f2 delete and replace image 2023-04-04 22:07:36 +02:00
404ad5dd6c update image not fix 2023-04-04 00:00:28 +02:00
501876dac2 add or update featured media 2023-04-03 23:45:48 +02:00
c9b1264153 remove private method for featured media 2023-04-02 18:01:57 +02:00
f77274f00e add headers json 2023-04-02 17:56:22 +02:00
1e162662e6 add featured media 2023-04-02 17:51:54 +02:00
ec4135c5d0 fix condition type file 2023-04-02 17:36:17 +02:00
cb64dd47ab create private method for add or update media 2023-04-02 17:34:55 +02:00
42b7e7e408 get featured image for post wip 2023-04-02 16:56:07 +02:00
cc33ab34df private variable 2023-04-02 13:14:52 +02:00
f07f8c040f add private method for association id 2023-04-02 13:06:10 +02:00
4054f41e9b add json dumps for post 2023-04-01 18:32:17 +02:00
faa22f1438 update post 2023-04-01 00:18:56 +02:00
bcb3abce01 fix variable 2023-04-01 00:11:33 +02:00
e0b4895b62 association image article 2023-03-31 00:14:38 +02:00
066d8cae52 remove print 2023-03-31 00:06:47 +02:00
90881eb037 add media in body html 2023-03-31 00:05:11 +02:00
c92f24e6af check image exist in media 2023-03-30 23:50:25 +02:00
301f1e2d4b add img successful in media 2023-03-30 23:29:29 +02:00
e1b0c0cba8 img wip 2023-03-29 22:59:15 +02:00
f250637912 add class + wordpress 2023-03-29 22:31:35 +02:00
19229bc65b add .gitignore + add function 2023-03-28 22:29:55 +02:00
d96d38e508 add author name 2023-03-28 16:43:56 +02:00
dc0fd0c781 insert comment 75% 2023-03-28 16:40:15 +02:00
e3b9e92c23 fix add author 2023-03-28 15:37:48 +02:00
82ce3d1a2b add author for article 2023-03-28 15:28:34 +02:00
605bd06e51 fix space 2023-03-28 12:07:11 +02:00
491f15ae3c first successful article insertion 2023-03-28 11:31:25 +02:00
0c41dc3e65 distinct tags and categories 2023-03-27 23:51:51 +02:00
3622e37942 add tags 2023-03-23 23:49:42 +01:00
eae95d5671 add script insert wip 2023-03-23 23:28:57 +01:00
6 changed files with 1007 additions and 241 deletions

5 .gitignore vendored Normal file

@@ -0,0 +1,5 @@
backup/
backup1/
backup2/
web_scrap.log
__pycache__/

202 import_export_canalblog.py Normal file

@@ -0,0 +1,202 @@
#!/usr/bin/python3
from requests.auth import HTTPBasicAuth
from getpass import getpass
from urllib.parse import urlparse
from concurrent import futures
import argparse, logging, threading
from lib.WPImport import WPimport
from lib.WPExport import WPExport
def download(name_thread, max_thread, url, logger, parser, directory, html, img):
exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory)
webpage = exportWp.getUrlPage(name_thread, max_thread)
if html is False:
exportWp.downloadHTML(webpage)
if img is False:
exportWp.downloadImg(webpage)
def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial):
canalblog = canalblog.split(",")
wordpress = wordpress.split(",")
name = "Thread-{0}".format(int(name_thread) + 1)
if serial is False:
for canal in canalblog:
try:
o = urlparse(canal)
o = o._replace(scheme="https")
url = o.geturl().replace(":///", "://")
except Exception as err:
logger.error("{0} : parsing error : {1}".format(name, err))
exit(1)
exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser)
webpage = exportWp.getUrlPage(name_thread, max_thread)
for j in wordpress:
importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser)
importWp.fromUrl(webpage)
else:
if len(canalblog) != len(wordpress):
logger.error("{0} : ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress".format(name))
exit(1)
for i in range(0, len(canalblog)):
try:
o = urlparse(canalblog[i])
o = o._replace(scheme="https")
url = o.geturl().replace(":///", "://")
except Exception as err:
logger.error("parsing error : {0}".format(err))
exit(1)
exportWp = WPExport(name=name, url=url, logger=logger, parser=parser)
webpage = exportWp.getUrlPage(name_thread, max_thread)
importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser)
importWp.fromUrl(webpage)
def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial):
name = "Thread-{0}".format(int(name_thread) + 1)
directory = directory.split(",")
if serial is False:
for i in wordpress:
importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser)
for j in directory:
importWp.fromDirectory(j, name_thread, max_thread)
else:
if len(directory) != len(wordpress):
logger.error("{0} : Error : Number directory is differant than wordpress".format(name))
exit(1)
for i in range(0, len(wordpress)):
importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser)
importWp.fromDirectory(directory[i])
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--debug", help="Verbosity", action="store_true")
parser.add_argument("--logfile", help="Log file", default="")
parser.add_argument("--quiet", help="No console output", action="store_true")
parser.add_argument("--parser", help="Parser content", default="html.parser")
parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1)
subparsers = parser.add_subparsers(dest="command")
import_parser = subparsers.add_parser("import")
import_parser.add_argument("--user", help="wordpress user", required=True)
import_parser.add_argument("--file", help="HTML file", default="")
import_parser.add_argument("--directory", help="HTML directory", default="")
import_parser.add_argument("--canalblog", help="URL Canalblog", default="")
import_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
import_parser.add_argument("--serial", help="Serial execution", action="store_true")
export_parser = subparsers.add_parser("export")
export_parser.add_argument("--url", help="canblog URL to be scraping", required=True)
export_parser.add_argument("--directory",
default="backup",
help="backup file path")
export_parser.add_argument("--no-css", help="No CSS", dest="css", action="store_true")
export_parser.add_argument("--no-js", help="No JS", dest="js", action="store_true")
export_parser.add_argument("--no-img", help="No img", dest="img", action="store_true")
export_parser.add_argument("--no-html", help="No HTML", dest="html", action="store_true")
args = parser.parse_args()
logger = logging.getLogger('import export canalblog')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
if args.quiet is False:
ch = logging.StreamHandler()
if args.debug is True:
logger.setLevel(logging.DEBUG)
ch.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.INFO)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)
if len(args.logfile) > 0:
fileHandler = logging.FileHandler(args.logfile)
if args.debug is True:
fileHandler.setLevel(logging.DEBUG)
else:
fileHandler.setLevel(logging.INFO)
fileHandler.setFormatter(formatter)
logger.addHandler(fileHandler)
if args.command == "import":
password = getpass()
if len(password) == 0:
logger.error("No password error !!! ")
exit(1)
basic = HTTPBasicAuth(args.user, password)
wordpress = args.wordpress.split(",")
importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser)
if len(args.file) > 0:
for i in wordpress:
importWp.setUrl(i)
importWp.fromFile(files=args.file.split(","))
exit(0)
if len(args.directory) > 0:
try:
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [
ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial)
for i in range(0, int(args.parallel))
]
except Exception as err:
logger.error("Threading error : {0}".format(err))
exit(0)
if len(args.canalblog) > 0:
try:
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [
ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial)
for i in range(0, int(args.parallel))
]
except Exception as err:
logger.error("Threading error : {0}".format(err))
if args.command == "export":
canalblog = args.url.split(",")
exportWp = WPExport(logger=logger, parser=args.parser, directory=args.directory)
for canal in canalblog:
try:
o = urlparse(canal)
o = o._replace(scheme="https")
url = o.geturl().replace(":///", "://")
except Exception as err:
logger.error("parsing error : {0}".format(err))
exit(1)
exportWp.setUrl(url)
if args.js is False:
exportWp.downloadJs()
if args.css is False:
exportWp.downloadCss()
if args.html is False or args.img is False:
try:
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [
ex.submit(download, i, int(args.parallel), url, logger, args.parser, args.directory, args.html, args.img)
for i in range(0, int(args.parallel))
]
except Exception as err:
logger.error("Threading error : {0}".format(err))
exit(0)
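
A hedged usage note: given the argument parser above, the two subcommands would be invoked roughly as follows (hostnames and paths are placeholders, and the import subcommand prompts for the WordPress password via getpass):
python3 import_export_canalblog.py --parallel 4 export --url www.example.canalblog.com --directory backup
python3 import_export_canalblog.py --parallel 4 import --user admin --wordpress blog.example.org --directory backup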

269 lib/WPExport.py Normal file

@@ -0,0 +1,269 @@
#!/usr/bin/python3
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests, os, argparse, logging
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
class WPExport:
def __init__(self, name = "Thread-0", url = "", logger = None, parser = "html.parser", directory = "backup"):
self._url = url
self._logger = logger
self._parser = parser
self._dir = directory
self._name = name
self._request = requests.Session()
retries = Retry(total=10,
status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)
self._request.mount('http://', HTTPAdapter(max_retries=retries))
# URLs are rewritten to https elsewhere, so mount the retry adapter there too
self._request.mount('https://', HTTPAdapter(max_retries=retries))
# Public method
# Set name
def setName(self, name):
self._name = "Thread-{0}".format(int(name) + 1)
# Set URL
def setUrl(self, url):
self._url = url
# Download JS
def downloadJs(self):
script = self._getScriptCss(True, False)
o = urlparse(self._url)
self._downloadPage(script, "{0}/{1}/{2}".format(self._dir, o.path, "dists/js"))
# Download CSS
def downloadCss(self):
css = self._getScriptCss(False, True)
o = urlparse(self._url)
self._downloadPage(css, "{0}/{1}/{2}".format(self._dir, o.path, "dists/css"))
# Download HTML
def downloadHTML(self, webpage):
self._downloadPage(webpage, self._dir)
# Download Image
def downloadImg(self, webpage):
page_src = self._getImg(webpage)
o = urlparse(self._url)
self._downloadPage(page_src, "{0}/{1}/{2}".format(self._dir, o.path, "img"))
# Get URL
def getUrlPage(self, index_thread, max_thread):
try:
page = self._request.get(self._url)
except Exception as err:
self._logger.error("{0} : Connection error : {1}".format(self._name, err))
exit(1)
page_url = []
if page.status_code == 200:
soup = BeautifulSoup(page.text, self._parser)
ul = soup.find_all("ul", id="listsmooth")
for anchor in ul[0].find_all("a"):
href = anchor.get('href', '/')
if href != "#":
page_url.append(href)
else:
self._logger.error("{0} : URL did not get due status code : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
webpage = []
for i in page_url:
try:
page = self._request.get(i)
except Exception as err:
self._logger.error("{0} : Connection error : {1}".format(self._name, err))
exit(1)
if page.status_code == 200:
self._logger.info("{0} : page : {1}".format(self._name, i))
if i not in webpage:
webpage.append(i)
soup = BeautifulSoup(page.text, self._parser)
class_div = soup.find_all("div", class_="pagingfirstline")
if len(class_div) > 0:
pagingfirstline = class_div[0].find_all("a")
if len(pagingfirstline) > 1:
lastpage = pagingfirstline[len(pagingfirstline)-1].get("href", "/")
element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1]
number_page = element_lastpage.split("-")[0].split("p")[1]
number_lastpage = int(number_page) / 10
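# Partition the archive pages evenly across threads : this thread takes
# the contiguous slice [firstPagePart, setPagePart] of page indexes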
setPageDivided = int(number_lastpage) / max_thread
setPagePart = setPageDivided * (index_thread + 1)
firstPagePart = (setPagePart - setPageDivided) + 1
self._logger.debug("{0} : Total page : {1}".format(self._name,int(number_lastpage)))
self._logger.debug("{0} : First range : {1}".format(self._name, int(firstPagePart)))
self._logger.debug("{0} : Last range : {1}".format(self._name, int(setPagePart)))
for j in range(int(firstPagePart),int(setPagePart)):
paging = j * 10
categorie = urlparse(i).path.split("/")
url_paging = "{0}/archives/p{1}-10.html".format(self._url, paging)
if len(categorie) > 2:
url_paging = "{0}/archives/{1}/p{2}-10.html".format(self._url, categorie[2], paging)
self._logger.info("{0} : {1}".format(self._name, url_paging))
if url_paging not in webpage:
webpage.append(url_paging)
page = self._request.get(url_paging)
if page.status_code == 200:
soup = BeautifulSoup(page.text, self._parser)
h2 = soup.find_all("h2")
for title in h2:
href = title.find_all("a")[0].get("href", "/")
if href not in webpage:
try:
o = urlparse(href)
o = o._replace(scheme="https").geturl()
except Exception as err:
self._logger.error("parsing error : {0}".format(err))
exit(1)
webpage.append(o)
else:
self._logger.error("{0} : web didn't get due status code : {1}".format(self._name, page.status_code))
self._logger.debug(page.content)
return webpage
# Private method
#
# Create path
def _mkdirPath(self, path_dir):
if not os.path.exists(path_dir):
makedir = []
pathh = path_dir.split("/")
for i in pathh:
makedir.append(i)
repath = "/".join(makedir)
if not os.path.exists(repath):
self._logger.debug("{0} : Dossier crée : {1}".format(self._name, repath))
try:
if len(repath) > 0:
os.mkdir(repath)
except Exception as err:
self._logger.error("Directory error : {0}".format(err))
self._logger.debug("Directory error : {0} {1} {2} {3} {4}".format(err, path_dir, repath, pathh, makedir))
exit(1)
# Get Css and JS
def _getScriptCss(self, js, css):
try:
page = self._request.get(self._url)
except Exception as err:
self._logger.error("Connection error : {0}".format(err))
exit(1)
page_url = []
if page.status_code == 200:
soup = BeautifulSoup(page.text, self._parser)
if js is True:
script = soup.find_all("script")
for anchor in script:
src = anchor.get("src", "/")
if src != "/":
try:
u = urlparse(self._url)
o = urlparse(src)
except Exception as err:
self._logger.error("parsing error : {0}".format(err))
exit(1)
if o.netloc == "":
o = o._replace(netloc=u.netloc)
o = o._replace(scheme=u.scheme)
page_url.append(o.geturl())
if css is True:
link = soup.find_all("link")
for anchor in link:
rel = anchor.get("rel")
if rel[0] == "stylesheet":
href = anchor.get("href", "/")
if href != "/":
try:
u = urlparse(self._url)
o = urlparse(href)
except Exception as err:
self._logger.error("parsing error : {0}".format(err))
exit(1)
if o.netloc == "":
o = o._replace(netloc=u.netloc)
o = o._replace(scheme=u.scheme)
page_url.append(o.geturl())
else:
self._logger.error("JS or CSS did not get due status code : {0}".format(page.status_code))
self._logger.debug(page.content)
return page_url
# Get image
def _getImg(self, webpage):
page_img = []
for i in webpage:
try:
page = self._request.get(i)
except Exception as err:
self._logger.error("{0} : Connection error : {1}".format(self._name, err))
exit(1)
if page.status_code == 200:
soup = BeautifulSoup(page.text, self._parser)
img = soup.find_all("img")
self._logger.info("{0} : image from page: {1} : ".format(self._name,i))
for anchor in img:
src = anchor.get("src", "/")
if src != "/":
if src not in page_img:
self._logger.info("{0} : image: {1} : ".format(self._name, src))
page_img.append(src)
else:
self._logger.error("{0} : Image did not get due status code : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
return page_img
# Download page
def _downloadPage(self, webpage, backup_dir):
for i in range(0, len(webpage)):
try:
o = urlparse(webpage[i])
except Exception as err:
self._logger.error("parsing error : {0}".format(err))
exit(1)
path_web = o.path.split("/")
filePageWeb = path_web[len(path_web)-1]
path_web.pop(len(path_web)-1)
dir_page_web = "/".join(path_web)
self._mkdirPath("{0}/{1}/{2}".format(backup_dir, o.netloc, dir_page_web))
try:
r = self._request.get(webpage[i])
except Exception as err:
self._logger.error("Connection error : {0}".format(err))
exit(1)
if r.status_code == 200:
fileDownload = "{0}/{1}/index.html".format(backup_dir, o.netloc)
if len(dir_page_web) > 0 and len(filePageWeb) > 0:
fileDownload = "{0}/{1}{2}/{3}".format(backup_dir, o.netloc, dir_page_web, filePageWeb)
self._logger.info("{0} : {1}/{2} : {3}".format(self._name, i+1, len(webpage), fileDownload))
try:
open(fileDownload, "wb").write(r.content)
except Exception as err:
self._logger.error("file error : {0}".format(err))
exit(1)
else:
self._logger.error("Not download due status code : {0}".format(r.status_code))
self._logger.debug(r.content)
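
A minimal usage sketch for WPExport, assuming a reachable Canalblog site (URL and directory are placeholders) and a single thread:
#!/usr/bin/python3
import logging
from lib.WPExport import WPExport
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("sketch")
export = WPExport(url="https://www.example.canalblog.com", logger=logger, directory="backup")
export.downloadJs()
export.downloadCss()
# Thread index 0 of 1 makes getUrlPage cover the whole page range
webpage = export.getUrlPage(0, 1)
export.downloadHTML(webpage)
export.downloadImg(webpage)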

531 lib/WPImport.py Normal file

@@ -0,0 +1,531 @@
#!/usr/bin/python3
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests, os, logging, re, json
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
class WPimport:
# Constructor
def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser"):
self._name = name
self._basic = basic
self._wordpress = wordpress
self._logger = logger
self._parser = parser
self._headers_json = {'Content-Type': 'application/json', 'Accept':'application/json'}
self._request = requests.Session()
retries = Retry(total=10,
status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)
self._request.mount('http://', HTTPAdapter(max_retries=retries))
# Image sources may be served over https, so mount the retry adapter there too
self._request.mount('https://', HTTPAdapter(max_retries=retries))
# Public method
def setUrl(self, wordpress):
self._wordpress = wordpress
def fromUrl(self, webpage):
for i in range(0, len(webpage)):
try:
r = self._request.get(webpage[i])
except Exception as err:
self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err))
exit(1)
if r.status_code == 200:
self._logger.info("{0} : ({1}/{2} : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i]))
soup = BeautifulSoup(r.content, self._parser)
articlebody = soup.find_all("div", class_="articlebody")
if len(articlebody) > 0:
self._addOrUpdatePost(soup)
else:
self._addOrUpdateFeaturedMedia(soup)
else:
self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))
def fromDirectory(self, directory="", number_thread=0, max_thread=1):
directory = "{0}/archives".format(directory)
directories = self._getDirectories([], "{0}".format(directory))
if len(directories) > 0:
files = self._getFiles(directories)
self.fromFile(files, number_thread, max_thread)
else:
self._logger.error("{0} : No files for {1}".format(self._name, directory))
def fromFile(self, files=[], number_thread=0, max_thread=1):
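# Each thread processes an equal, contiguous slice of the file list :
# thread n takes files[n * len/max_thread : (n+1) * len/max_thread]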
divFiles = int(len(files) / max_thread)
currentRangeFiles = int(divFiles * (number_thread+1))
firstRange = int(currentRangeFiles - divFiles)
self._logger.debug("{0} : index : {1}".format(self._name,number_thread))
self._logger.debug("{0} : first range : {1}".format(self._name,firstRange))
self._logger.debug("{0} : last range : {1}".format(self._name,currentRangeFiles))
for i in range(firstRange, currentRangeFiles):
if os.path.exists(files[i]):
self._logger.info("{0} : File is being processed : {1}".format(self._name, files[i]))
with open(files[i], 'r') as f:
content = f.read()
soup = BeautifulSoup(content, self._parser)
articlebody = soup.find_all("div", class_="articlebody")
self._logger.debug("{0} : Number of article : {1}".format(self._name, len(articlebody)))
if len(articlebody) > 0:
self._addOrUpdatePost(soup)
else:
self._addOrUpdateFeaturedMedia(soup)
# Private method
## Get all files
def _getFiles(self, item):
files = []
for i in item:
for j in os.listdir(i):
if os.path.isfile("{0}/{1}".format(i, j)):
files.append("{0}/{1}".format(i, j))
return files
## Get directories
def _getDirectories(self, subdirectory, item):
sub = subdirectory
for i in os.listdir(item):
if os.path.isdir("{0}/{1}".format(item, i)):
sub.append("{0}/{1}".format(item, i))
subdirectory = self._getDirectories(sub, "{0}/{1}".format(item, i))
return subdirectory
## Add or update featured media
def _addOrUpdateFeaturedMedia(self, soup):
item_div = soup.find_all("div", {"data-edittype": "post"})
for i in item_div:
h2 = i.find_all("h2")[0].text
params = {"search":h2, "type":"post"}
try:
page = self._request.get("http://{0}/wp-json/wp/v2/search".format(self._wordpress), auth=self._basic, params=params)
except Exception as err:
self._logger.error("{0} : Connection error : {1}".format(self._name, err))
exit(1)
if page.status_code == 200:
result = page.json()
if len(result) > 0:
if h2 == result[0]["title"]:
img = i.find_all("img")
if len(img) > 0:
img_src = img[0].get("src")
try:
page = self._request.get(img_src)
except Exception as err:
self._logger.error("{0} : Connection error for get featured media : {1}".format(self._name, err))
exit(1)
if page.status_code == 200:
name_img = img_src.replace("_q", "")
name_img = name_img.split("/")[len(name_img.split("/"))-1]
params = {"search": name_img}
try:
page = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params)
except Exception as err:
self._logger.error("{0} : Connection error search featured media : {1}".format(self._name, err))
exit(1)
if page.status_code == 200:
res = page.json()
if len(res) > 0:
id_media = res[0]["id"]
data = {"featured_media": id_media}
try:
r = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, result[0]["id"]), auth=self._basic, headers=self._headers_json, data=json.dumps(data))
except Exception as err:
self._logger.error("{0} : Connection error for post media featured : {1}".format(self._name, err))
exit(1)
if r.status_code == 200:
self._logger.info("{0} : Add media featured : {1}".format(self._name, r.json()["title"]["raw"]))
else:
self._logger.error("{0} : Connection error with status code for featured media : {1}".format(self._name, r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))
else:
self._logger.info("{0} : No media found for {1}".format(self._name, h2))
else:
self._logger.error("{0} : Connection error with status code for search featured media: {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
else:
self._logger.error("{0} : Connection error for get featured media with status code : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
else:
self._logger.error("{0} : Connection error with status code for featured media : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
## Association image to post
def _linkImgPost(self, title, list_img, post_id):
for i in list_img:
data = {"post": post_id}
try:
r = self._request.post("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, i["id"]), auth=self._basic, data=data)
except Exception as err:
self._logger.error("{0} : Connection error for link image to post : {1}".format(self._name, err))
exit(1)
if r.status_code == 200:
self._logger.info("{0} : Link image to post {1}".format(self._name, title))
else:
self._logger.error("{0} Connection error with status code for link image to post : {1}".format(self._name, r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))
## Add or update img
def _addOrUpdateMedia(self, href_img, page):
media = {"id":"", "rendered":""}
split_fileimg = href_img.split("/")
img_name = split_fileimg[len(split_fileimg)-1]
params = { "search": img_name}
try:
r = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params)
except Exception as err:
self._logger.error("{0} : Connection error for search media : {1}".format(self._name, err))
exit(1)
if r.status_code == 200:
res = r.json()
if len(res) > 0:
params = {"force":1}
try:
r = self._request.delete("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, res[0]["id"]), auth=self._basic, params=params)
except Exception as err:
self._logger.error("{0} Connection error for delete image : {1}".format(self._name, err))
exit(1)
if r.status_code == 200:
self._logger.info("{0} : Image removed {1}".format(self._name, img_name))
else:
self._logger.error("{0} : Image not removed due status code : {1}".format(self._name, r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))
data = page.content
img_type = "image/png"
if img_name.split(".")[1] == "jpg" or img_name.split(".")[1] == "jpeg":
img_type = "image/jpg"
headers={ 'Content-Type': img_type,'Content-Disposition' : 'attachment; filename={0}'.format(img_name)}
try:
r = self._request.post("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, headers=headers, data=data)
except Exception as err:
self._logger.error("{0} : Connection error for add image : {1}".format(self._name, err))
exit(1)
if r.status_code == 201:
self._logger.info("{0} : Image added {1}".format(self._name, img_name))
res = r.json()
media["id"] = res["id"]
media["rendered"] = res["guid"]["rendered"]
else:
self._logger.error("{0} : Image not added due status code : {1}".format(self._name, r.status_code))
self._logger.debug(r.content)
else:
self._logger.error("{0} : Connection error for search image with status code : {1}".format(self._name, r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))
return media
## Add or update comment
def _addOrUpdateComment(self, post, comment, title):
for i in comment:
try:
params = {"post": post, "author_name":i["author"], "date":i["date"]}
page = self._request.get("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, params=params)
except Exception as err:
self._logger.error("{0} : Connection error for search comment : {1}".format(self._name, err))
exit(1)
if page.status_code == 200:
result = page.json()
for j in result:
try:
params = {"force":1}
page = self._request.delete("http://{0}/wp-json/wp/v2/comments/{1}".format(self._wordpress, j["id"]), params=params, auth=self._basic)
except Exception as err:
self._logger.error("{0} : Connection error for delete comment : {1}".format(self._name, err))
exit(1)
if page.status_code == 200:
self._logger.info("{0} : Comment deleted for {1}".format(self._name, title))
self._logger.debug("{0} : Comment deleted : {1}".format(self._name, j))
else:
self._logger.error("{0} : Comment not deleted for {1} due status code : {2}".format(self._name, title, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
else:
self._logger.error("{0} : Comment not listed for {1} due status code : {2}".format(self._name, title, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
for i in comment:
data = {"post": post, "content": i["content"], "date": i["date"], "author_name": i["author"], "status": "approved"}
if i["parent_id"] != -1:
parent_id = int(i["parent_id"])
params = {"post": post, "author_name":comment[parent_id]["author"], "date":comment[parent_id]["date"]}
try:
page = self._request.get("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, params=params)
except Exception as err:
self._logger.error("{0} : Connection error for parent comment : {1}".format(self._name, err))
exit(1)
if page.status_code == 200:
result = page.json()
if len(result) > 0:
data["parent"]=result[0]["id"]
else:
self._logger.error("{0} : Connection error for parent comment with status code : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
try:
page = self._request.post("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, data=data)
except Exception as err:
self._logger.error("{0} : Connection error for add comment : {1}".format(self._name, err))
exit(1)
if page.status_code == 201:
self._logger.info("{0} : Comment added for {1}".format(self._name, title))
self._logger.debug("{0} : Data : {1}".format(self._name, data))
else:
self._logger.error("{0} : Comment not added for {1} due status code : {2}".format(self._name, title, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
## Check class name
def _hasClassName(self, tag, className):
for i in tag["class"]:
if i == className:
return True
return False
## Get class name
def _getClassName(self, tag, className):
for i in tag["class"]:
if re.match(className, i):
return i
return ""
## Get all comments
def _getComment(self, comment):
comment_post = []
for i in range(0, len(comment)):
comment_div = comment[i].find("div", class_="comment_item")
comment_item = comment_div.text.split("\n")
footer = comment_div.find_all("div", class_="itemfooter")
comment_author = footer[0].text.split(",")[0].replace("Posté par ", "")
comment_date = footer[0].find_all("abbr")[0].get("title")
comment_content = "<p>"
for j in range(0, len(comment_item)-2):
if len(comment_item[j]) > 0:
comment_content = comment_content + comment_item[j] + "<br />"
comment_content = comment_content + "</p>"
parent = -1
if self._hasClassName(comment[i], "level-1") is False:
block = False
className = self._getClassName(comment[i], "level-").split("-")
level = 1
if len(className) > 0:
level = int(className[1])
for j in range(i-1, 0, -1):
if block is False:
levelName = "level-{0}".format(level - 1)
if self._hasClassName(comment[j], levelName) is True:
parent = j
block = True
comment_post.append({"author": comment_author, "date": comment_date, "content": comment_content, "parent_id":parent})
return comment_post
## Add or Update post
def _addOrUpdatePost(self, soup):
tags = []
month = {"janvier":"01", "février": "02", "mars": "03", "avril":"04", "mai": "05", "juin": "06", "juillet": "07", "août": "08", "septembre": "09", "octobre": "10", "novembre": "11", "décembre": "12"}
liste = ["categories", "tags"]
elements = {}
element = {}
listelement = {}
for i in liste:
element[i] = []
listelement[i] = []
articletitle = soup.find_all("h2", class_="articletitle")
articlebody = soup.find_all("div", class_="articlebody")
articledate = soup.find_all("span", class_="articledate")
articleacreator = soup.find_all("span", class_="articlecreator")
dateheader = soup.find_all("div", class_="dateheader")
itemfooter = soup.find_all("div", class_="itemfooter")
comment = soup.find_all("li", class_="comment")
img_a = articlebody[0].find_all("a", {"target": "_blank"})
list_img = []
for i in img_a:
new_img = {}
img = i.find_all("img")
if len(img) > 0:
href_a = i.get("href")
href_img = img[0].get("src")
new_img["old_src"]=href_img
new_img["old_href"]=href_a
try:
page_img = self._request.get(href_img)
except Exception as err:
self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err))
exit(1)
if page_img.status_code == 404:
href_img = href_a
try:
page_img = self._request.get(href_a)
except Exception as err:
self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err))
exit(1)
if page_img.status_code == 200:
media=self._addOrUpdateMedia(href_img, page_img)
new_img["id"]=media["id"]
new_img["new_src"]=media["rendered"]
list_img.append(new_img)
if href_img != href_a:
media=self._addOrUpdateMedia(href_a, page_img)
new_img["id"]=media["id"]
new_img["new_src"]=media["rendered"]
list_img.append(new_img)
if page_img.status_code not in [200, 404]:
self._logger.error("{0} : Connection error with status code for get image : {1}".format(self._name, page_img.status_code))
self._logger.debug("{0} : {1}".format(self._name, page_img.content))
comment_post = self._getComment(comment)
a = itemfooter[0].find_all("a", {"rel": True})
for i in a:
rel = i.get("rel")
if rel[0] == 'tag':
href = i.get("href")
if re.search(r'/tag/', href):
element["tags"].append(i.text)
if re.search(r'/archives/', href):
element["categories"].append(i.text)
for i in liste:
for j in element[i]:
element_exist = False
try:
params = {"params":j}
page = self._request.get("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, params=params)
except Exception as err:
self._logger.error("{0} : Connection error for {1} : {2}".format(self._name, i, err))
exit(1)
if page.status_code == 200:
result = page.json()
if len(result) > 0:
element_exist = True
listelement[i].append(result[0]["id"])
else:
self._logger.error("{0} : {1} not found due status code : {2}".format(self._name, i, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
if element_exist is False:
data = {"name": j}
self._logger.debug("{0} : URL : {1} ".format("http://{1}/wp-json/wp/v2/{2}".format(self._name, self._wordpress, i)))
self._logger.debug("{0} : data : {1}".format(self._name, data))
self._logger.debug("{0} : headers : {1}".format(self._name, self._headers_form))
try:
page = self._request.post("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, headers=self._headers_json, data=data)
except Exception as err:
self._logger.error("{0} : Connection error for post {1} : {2}".format(self._name, i, err))
exit(1)
if page.status_code == 201:
result = page.json()
listelement[i].append(result["id"])
else:
self._logger.error("{0} : {1} not added due status code : {2}".format(self._name, i, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
title = articletitle[0].text
author = articleacreator[0].text.lower()
body = articlebody[0].find_all("p")
bodyhtml = "<p>"
for i in body:
if len(i.text) == 1:
bodyhtml = bodyhtml + "<br />"
else:
bodyhtml = bodyhtml + str(i).replace("<p>", "").replace("</p>", "").replace("<br>", "<br />") + "<br />"
bodyhtml = bodyhtml + "</p>"
for i in list_img:
o = urlparse(i["new_src"])
bodyhtml = bodyhtml.replace(i["old_href"], o.path)
bodyhtml = bodyhtml.replace(i["old_src"], o.path)
hour = articledate[0].text
time = dateheader[0].text.split(" ")
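# Build the post date "YYYY-MM-DDTHH:MM:00" from the French date header
# "<day> <month> <year>" and the hour shown in articledate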
data = {"title":title, "content":bodyhtml, "status":"publish", "date": "{0}-{1}-{2}T{3}:00".format(time[2],month[time[1]],time[0], hour), "tags": listelement["tags"], "categories": listelement["categories"]}
params = {"search":author}
try:
page = self._request.get("http://{0}/wp-json/wp/v2/users".format(self._wordpress), auth=self._basic, params=params)
except Exception as err:
self._logger.error("{0} : Connection error for get author : {1}".format(self._name, err))
exit(1)
if page.status_code == 200:
result = page.json()
data["author"] = result[0]["id"]
else:
self._logger.error("{0} : Connection error with status code for get author : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(page.content))
params = {"search":title}
try:
page = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params)
except Exception as err:
self._logger.error("{0} : Connection error for search post : {1}".format(self._name, err))
exit(1)
page_exist = True
headers = {'Content-Type': 'application/json', 'Accept':'application/json'}
if page.status_code == 200:
result = page.json()
if len(result) == 0:
page_exist = False
else:
self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, title))
post_id = result[0]["id"]
try:
page = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, data=json.dumps(data))
except Exception as err:
self._logger.error("{0} : Connection error for update post : {1}".format(self._name, err))
exit(1)
if page.status_code == 200:
result = page.json()
self._logger.info("{0} : Post updated : {1}".format(self._name, result["title"]["raw"]))
self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"])
self._linkImgPost(result["title"]["raw"], list_img, result["id"])
else:
self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
else:
self._logger.error("{0} : Connection for update post error with status code : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
if page_exist is False:
try:
page = self._request.post("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, headers=headers, data=json.dumps(data))
except Exception as err:
self._logger.error("{0} : Connection error for create post : {1}".format(self._name, err))
exit(1)
if page.status_code == 201:
result = page.json()
self._logger.info("{0} : Post added : {1}".format(self._name, result["title"]["raw"]))
self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"])
self._linkImgPost(result["title"]["raw"], list_img, result["id"])
else:
self._logger.error("{0} : Post not added due status code : {1}".format(self._name, r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))
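
A minimal usage sketch for WPimport, assuming a WordPress instance with the REST API enabled (hostname, credentials, and file path are placeholders):
#!/usr/bin/python3
import logging
from requests.auth import HTTPBasicAuth
from lib.WPImport import WPimport
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("sketch")
basic = HTTPBasicAuth("admin", "application-password")
importWp = WPimport(basic=basic, wordpress="blog.example.org", logger=logger)
# Single thread : index 0 of 1 covers the whole file list
importWp.fromFile(files=["backup/www.example.canalblog.com/archives/2023/04/09/12345.html"], number_thread=0, max_thread=1)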

0 lib/__init__.py Normal file

241 web_scrap.py Deleted file

@@ -1,241 +0,0 @@
#!/usr/bin/python3
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests, os, argparse, logging
def mkdirPath(path_dir, logger):
if not os.path.exists(path_dir):
makedir = []
pathh = path_dir.split("/")
for i in pathh:
makedir.append(i)
repath = "/".join(makedir)
if not os.path.exists(repath):
logger.debug("Dossier crée : {0}".format(repath))
try:
if len(repath) > 0:
os.mkdir(repath)
except Exception as err:
logger.error("Directory error : {0}".format(err))
logger.debug("Directory error : {0} {1} {2} {3} {4}".format(err, path_dir, repath, pathh, makedir))
exit(1)
def getScriptCss(url, js, css, logger):
try:
page = requests.get(url)
except Exception as err:
logger.error("Connection error : {0}".format(err))
exit(1)
page_url = []
if page.status_code == 200:
soup = BeautifulSoup(page.text, 'html.parser')
if js is True:
script = soup.find_all("script")
for anchor in script:
src = anchor.get("src", "/")
if src != "/":
try:
u = urlparse(url)
o = urlparse(src)
except Exception as err:
logger.error("parsing error : {0}".format(err))
exit(1)
if o.netloc == "":
o = o._replace(netloc=u.netloc)
o = o._replace(scheme=u.scheme)
page_url.append(o.geturl())
if css is True:
link = soup.find_all("link")
for anchor in link:
rel = anchor.get("rel")
if rel[0] == "stylesheet":
href = anchor.get("href", "/")
if href != "/":
try:
u = urlparse(url)
o = urlparse(href)
except Exception as err:
logger.error("parsing error : {0}".format(err))
exit(1)
if o.netloc == "":
o = o._replace(netloc=u.netloc)
o = o._replace(scheme=u.scheme)
page_url.append(o.geturl())
return page_url
def getImg(webpage, logger):
page_img = []
for i in webpage:
try:
page = requests.get(i)
except Exception as err:
logger.error("Connection error : {0}".format(err))
exit(1)
if page.status_code == 200:
soup = BeautifulSoup(page.text, 'html.parser')
img = soup.find_all("img")
logger.info("image from page: {0} : ".format(i))
for anchor in img:
src = anchor.get("src", "/")
if src != "/":
if src not in page_img:
logger.info("image: {0} : ".format(src))
page_img.append(src)
return page_img
def getUrlPage(url, logger):
try:
page = requests.get(url)
except Exception as err:
logger.error("Connection error : {0}".format(err))
exit(1)
page_url = []
if page.status_code == 200:
soup = BeautifulSoup(page.text, 'html.parser')
ul = soup.find_all("ul", id="listsmooth")
for anchor in ul[0].find_all("a"):
href = anchor.get('href', '/')
if href != "#":
page_url.append(href)
webpage = []
for i in page_url:
try:
page = requests.get(i)
except Exception as err:
logger.error("Connection error : {0}".format(err))
exit(1)
if page.status_code == 200:
logger.info("page : {0}".format(i))
if i not in webpage:
webpage.append(i)
soup = BeautifulSoup(page.text, 'html.parser')
class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline")
if len(class_div) > 0:
pagingfirstline = class_div[0].find_all("a")
if len(pagingfirstline) > 1:
lastpage = pagingfirstline[len(pagingfirstline)-1].get("href", "/")
element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1]
number_page = element_lastpage.split("-")[0].split("p")[1]
number_lastpage = int(number_page) / 10
for j in range(1,int(number_lastpage)):
paging = j * 10
categorie = urlparse(i).path.split("/")
url_paging = "{0}/archives/p{1}-10.html".format(url, paging)
if len(categorie) > 2:
url_paging = "{0}/archives/{1}/p{2}-10.html".format(url, categorie[2], paging)
logger.info(url_paging)
if url_paging not in webpage:
webpage.append(url_paging)
page = requests.get(url_paging)
if page.status_code == 200:
soup = BeautifulSoup(page.text, 'html.parser')
h2 = soup.find_all("h2")
for title in h2:
href = title.find_all("a")[0].get("href", "/")
if href not in webpage:
try:
o = urlparse(href)
o = o._replace(scheme="https").geturl()
except Exception as err:
logger.error("parsing error : {0}".format(err))
exit(1)
webpage.append(o)
return webpage
def downloadPage(webpage, backup_dir, logger):
for i in range(0, len(webpage)):
try:
o = urlparse(webpage[i])
except Exception as err:
logger.error("parsing error : {0}".format(err))
exit(1)
path_web = o.path.split("/")
filePageWeb = path_web[len(path_web)-1]
path_web.pop(len(path_web)-1)
dir_page_web = "/".join(path_web)
mkdirPath("{0}/{1}/{2}".format(backup_dir, o.netloc, dir_page_web), logger)
try:
r = requests.get(webpage[i])
except Exception as err:
logger.error("Connection error : {0}".format(err))
exit(1)
if r.status_code == 200:
fileDownload = "{0}/{1}/index.html".format(backup_dir, o.netloc)
if len(dir_page_web) > 0 and len(filePageWeb) > 0:
fileDownload = "{0}/{1}{2}/{3}".format(backup_dir, o.netloc, dir_page_web, filePageWeb)
logger.info("{0}/{1} : {2}".format(i+1, len(webpage), fileDownload))
try:
open(fileDownload, "wb").write(r.content)
except Exception as err:
logger.error("file error : {0}".format(err))
exit(1)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--url", help="canblog URL to be scraping", required=True)
parser.add_argument("--dir",
default="backup",
help="backup file path")
parser.add_argument("--debug", help="Verbosity", action="store_true")
parser.add_argument("--logfile", help="Log file", default="")
parser.add_argument("--no-css", help="No CSS", dest="css", action="store_true")
parser.add_argument("--no-js", help="No JS", dest="js", action="store_true")
parser.add_argument("--no-img", help="No img", dest="img", action="store_true")
parser.add_argument("--no-html", help="No HTML", dest="html", action="store_true")
parser.add_argument("--quiet", help="No console output", action="store_true")
args = parser.parse_args()
logger = logging.getLogger('web_scrap')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
if args.quiet is False:
ch = logging.StreamHandler()
if args.debug is True:
logger.setLevel(logging.DEBUG)
ch.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.INFO)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)
if len(args.logfile) > 0:
fileHandler = logging.FileHandler(args.logfile)
if args.debug is True:
fileHandler.setLevel(logging.DEBUG)
else:
fileHandler.setLevel(logging.INFO)
fileHandler.setFormatter(formatter)
logger.addHandler(fileHandler)
try:
o = urlparse(args.url)
o = o._replace(scheme="https")
url = o.geturl().replace(":///", "://")
except Exception as err:
logger.error("parsing error : {0}".format(err))
if args.js is False:
script = getScriptCss(url, True, False, logger)
downloadPage(script, "{0}/{1}/{2}".format(args.dir, o.path, "dists/js"), logger)
if args.css is False:
css = getScriptCss(url, False, True, logger)
downloadPage(css, "{0}/{1}/{2}".format(args.dir, o.path, "dists/css"), logger)
if args.html is False or args.img is False:
webpage = getUrlPage(url, logger)
if args.html is False:
downloadPage(webpage, args.dir, logger)
if args.img is False:
page_src = getImg(webpage, logger)
downloadPage(page_src, "{0}/{1}/{2}".format(args.dir, o.path, "img"), logger)