Merge pull request 'thread' (#9) from thread into master

Reviewed-on: #9
This commit is contained in:
v4l3n71n 2023-05-01 20:05:02 +00:00
commit 3161a06459
4 changed files with 258 additions and 165 deletions

4
.gitignore vendored
View File

@ -1,4 +1,4 @@
backup/ backup*/
backup1/ wp-navigation
web_scrap.log web_scrap.log
__pycache__/ __pycache__/

View File

@ -2,17 +2,95 @@
from requests.auth import HTTPBasicAuth from requests.auth import HTTPBasicAuth
from getpass import getpass from getpass import getpass
from urllib.parse import urlparse from urllib.parse import urlparse
import argparse, logging from concurrent import futures
import argparse, logging, threading
from lib.WPImport import WPimport from lib.WPImport import WPimport
from lib.WPExport import WPExport from lib.WPExport import WPExport
def download(name_thread, max_thread, url, logger, parser, directory, html, img):
    """Export this worker's share of a blog's pages.

    Args:
        name_thread: Zero-based index of this worker. Also used to build the
            "Thread-N" label (N = index + 1) given to WPExport.
        max_thread: Total number of workers; forwarded with the index to
            WPExport.getUrlPage so it can pick this worker's page range.
        url: Blog URL to export.
        logger: Logger handed to WPExport.
        parser: Parser name handed to WPExport.
        directory: Backup directory handed to WPExport.
        html: Pages are downloaded only when this is exactly False.
        img: Images are downloaded only when this is exactly False.
    """
    exportWp = WPExport(
        name="Thread-{0}".format(int(name_thread) + 1),
        url=url,
        logger=logger,
        parser=parser,
        directory=directory,
    )
    webpage = exportWp.getUrlPage(name_thread, max_thread)
    if html is False:
        exportWp.downloadHTML(webpage)
    # Use the ``img`` argument, not the module-level ``args``: ``args`` only
    # exists when the file runs as a script, and reading it here made the
    # parameter dead.
    if img is False:
        exportWp.downloadImg(webpage)
    # Dropping the last reference lets WPExport.__del__ log the end of export.
    del exportWp
def _canalblog_https_url(canal):
    """Return ``canal`` rewritten to use the https scheme."""
    forced = urlparse(canal)._replace(scheme="https")
    # When the input had no scheme, geturl() can render an empty netloc as
    # "https:///host"; collapse the extra slash as the original code did.
    return forced.geturl().replace(":///", "://")


def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial):
    """Import this worker's share of blog pages into WordPress sites.

    Args:
        name_thread: Zero-based index of this worker. Also used to build the
            "Thread-N" label (N = index + 1) used in log messages.
        max_thread: Total number of workers; forwarded with the index to
            WPExport.getUrlPage.
        canalblog: Comma-separated source blog URLs.
        logger: Logger used here and handed to WPExport / WPimport.
        parser: Parser name handed to WPExport / WPimport.
        wordpress: Comma-separated WordPress targets.
        basic: Authentication object handed to WPimport.
        serial: When exactly False, every source is imported into every
            target. Otherwise source i is imported into target i only, and
            both lists must have the same length.

    Raises:
        SystemExit: via exit(1) on a URL parsing error or, in serial mode,
            when the two lists differ in length.
            NOTE(review): inside a thread-pool worker this only ends that
            worker - confirm the caller inspects its futures.
    """
    canalblog = canalblog.split(",")
    wordpress = wordpress.split(",")
    name = "Thread-{0}".format(int(name_thread) + 1)

    if serial is False:
        for canal in canalblog:
            try:
                url = _canalblog_https_url(canal)
            except Exception as err:
                logger.error("{0} : parsing error : {1}".format(name, err))
                exit(1)
            exportWp = WPExport(name=name, url=url, logger=logger, parser=parser)
            webpage = exportWp.getUrlPage(name_thread, max_thread)
            del exportWp
            for target in wordpress:
                importWp = WPimport(name=name, basic=basic, wordpress=target, logger=logger, parser=parser)
                importWp.fromUrl(webpage)
                del importWp
        return

    if len(canalblog) != len(wordpress):
        logger.error("{0} : ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress".format(name))
        exit(1)
    # Pair sources with targets one-to-one. The previous
    # range(0, len(canalblog) - 1) stopped one short, so the last pair (or
    # the only pair) was never imported.
    for canal, target in zip(canalblog, wordpress):
        try:
            url = _canalblog_https_url(canal)
        except Exception as err:
            logger.error("parsing error : {0}".format(err))
            exit(1)
        exportWp = WPExport(name=name, url=url, logger=logger, parser=parser)
        webpage = exportWp.getUrlPage(name_thread, max_thread)
        del exportWp
        importWp = WPimport(name=name, basic=basic, wordpress=target, logger=logger, parser=parser)
        importWp.fromUrl(webpage)
        del importWp
def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial):
    """Import this worker's share of backed-up files into WordPress sites.

    Args:
        name_thread: Zero-based index of this worker. Also used to build the
            "Thread-N" label (N = index + 1) used in log messages.
        max_thread: Total number of workers; forwarded with the index to
            WPimport.fromDirectory.
        directory: Comma-separated backup directories.
        logger: Logger used here and handed to WPimport.
        parser: Parser name handed to WPimport.
        wordpress: Comma-separated WordPress targets.
        basic: Authentication object handed to WPimport.
        serial: When exactly False, every directory is imported into every
            target. Otherwise directory i is imported into target i only,
            and both lists must have the same length.

    Raises:
        SystemExit: via exit(1) in serial mode when the two lists differ in
            length.
    """
    name = "Thread-{0}".format(int(name_thread) + 1)
    directory = directory.split(",")
    wordpress = wordpress.split(",")

    if serial is False:
        for target in wordpress:
            importWp = WPimport(name=name, basic=basic, wordpress=target, logger=logger, parser=parser)
            for folder in directory:
                importWp.fromDirectory(folder, name_thread, max_thread)
            del importWp
        return

    if len(directory) != len(wordpress):
        logger.error("{0} : Error : Number directory is differant than wordpress".format(name))
        exit(1)
    # Pair directories with targets one-to-one. The previous
    # range(0, len(wordpress) - 1) stopped one short, so the last pair (or
    # the only pair) was never imported.
    for folder, target in zip(directory, wordpress):
        importWp = WPimport(name=name, basic=basic, wordpress=target, logger=logger, parser=parser)
        # Pass the worker index and count explicitly. With fromDirectory's
        # defaults (number_thread=1, max_thread=1), fromFile computes a range
        # that starts at len(files) and indexes past the end of the list.
        importWp.fromDirectory(folder, name_thread, max_thread)
        del importWp
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--debug", help="Verbosity", action="store_true") parser.add_argument("--debug", help="Verbosity", action="store_true")
parser.add_argument("--logfile", help="Log file", default="") parser.add_argument("--logfile", help="Log file", default="")
parser.add_argument("--quiet", help="No console output", action="store_true") parser.add_argument("--quiet", help="No console output", action="store_true")
parser.add_argument("--parser", help="Parser content", default="html.parser") parser.add_argument("--parser", help="Parser content", default="html.parser")
parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1)
subparsers = parser.add_subparsers(dest="command") subparsers = parser.add_subparsers(dest="command")
@ -76,64 +154,31 @@ if __name__ == '__main__':
if len(args.file) > 0: if len(args.file) > 0:
for i in wordpress: for i in wordpress:
importWp.setUrl(i) importWp.setUrl(i)
importWp.fromFile(args.file.split(",")) importWp.fromFile(files=args.file.split(","))
exit(0)
if len(args.directory) > 0: if len(args.directory) > 0:
directory = args.directory.split(",") try:
if args.serial is False: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
for i in wordpress: wait_for = [
importWp.setUrl(i) ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial)
for j in directory: for i in range(0, int(args.parallel))
importWp.fromDirectory(j) ]
else: except Exception as err:
if len(directory) != len(wordpress): logger.error("Threading error : {0}".format(err))
logger.error("ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress")
exit(1)
for i in range(0, len(wordpress)-1):
importWp.setUrl(wordpress[i])
importWp.fromDirectory(directory[i])
exit(0)
if len(args.canalblog) > 0: if len(args.canalblog) > 0:
exportWp = WPExport("", logger, args.parser, args.directory) try:
canalblog = args.canalblog.split(",") with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wordpress = args.wordpress.split(",") wait_for = [
ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial)
if args.serial is False: for i in range(0, int(args.parallel))
for canal in canalblog: ]
try: except Exception as err:
o = urlparse(canal) logger.error("Threading error : {0}".format(err))
o = o._replace(scheme="https") exit(0)
url = o.geturl().replace(":///", "://")
except Exception as err:
logger.error("parsing error : {0}".format(err))
exit(1)
exportWp.setUrl(url)
webpage = exportWp.getUrlPage()
for j in wordpress:
importWp.setUrl(j)
importWp.fromUrl(webpage)
else:
if len(canalblog) != len(wordpress):
logger.error("ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress")
exit(1)
for i in range(0, len(canalblog)-1):
try:
o = urlparse(canalblog[i])
o = o._replace(scheme="https")
url = o.geturl().replace(":///", "://")
except Exception as err:
logger.error("parsing error : {0}".format(err))
exit(1)
exportWp.setUrl(url)
webpage = exportWp.getUrlPage()
importWp.setUrl(wordpress[i])
importWp.fromUrl(webpage)
if args.command == "export": if args.command == "export":
canalblog = args.url.split(",") canalblog = args.url.split(",")
exportWp = WPExport("", logger, args.parser, args.directory) exportWp = WPExport(logger=logger, parser=args.parser, directory=args.directory)
for canal in canalblog: for canal in canalblog:
try: try:
o = urlparse(canal) o = urlparse(canal)
@ -148,12 +193,17 @@ if __name__ == '__main__':
if args.css is False: if args.css is False:
exportWp.downloadCss() exportWp.downloadCss()
del exportWp
if args.html is False or args.img is False: if args.html is False or args.img is False:
webpage = exportWp.getUrlPage() try:
if args.html is False: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
exportWp.downloadHTML(webpage) wait_for = [
ex.submit(download, i, int(args.parallel), url, logger, args.parser, args.directory, args.html, args.img)
if args.img is False: for i in range(0, int(args.parallel))
exportWp.downloadImg(webpage) ]
except Exception as err:
logger.error("Threading error : {0}".format(err))
exit(0) exit(0)

View File

@ -6,11 +6,12 @@ from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry from requests.packages.urllib3.util.retry import Retry
class WPExport: class WPExport:
def __init__(self, url, logger, parser, directory): def __init__(self, name = "Thread-0", url = "", logger = None, parser = "html.parser", directory = "backup"):
self._url = url self._url = url
self._logger = logger self._logger = logger
self._parser = parser self._parser = parser
self._dir = directory self._dir = directory
self._name = name
self._request = requests.Session() self._request = requests.Session()
@ -20,8 +21,17 @@ class WPExport:
self._request.mount('http://', HTTPAdapter(max_retries=retries)) self._request.mount('http://', HTTPAdapter(max_retries=retries))
# Destructor
def __del__(self):
self._logger.info("{0} : Export finished for {1}".format(self._name, self._url))
# Public method # Public method
# Set name
def setName(self, name):
self._name = "Thread-{0}".format(int(name) + 1)
# Set URL # Set URL
def setUrl(self, url): def setUrl(self, url):
@ -55,11 +65,11 @@ class WPExport:
# Get URL # Get URL
def getUrlPage(self): def getUrlPage(self, index_thread, max_thread):
try: try:
page = self._request.get(self._url) page = self._request.get(self._url)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error : {1}".format(self._name, err))
exit(1) exit(1)
page_url = [] page_url = []
if page.status_code == 200: if page.status_code == 200:
@ -70,8 +80,8 @@ class WPExport:
if href != "#": if href != "#":
page_url.append(href) page_url.append(href)
else: else:
self._logger.error("Url did not get due status code : {0}".format(page.status_code)) self._logger.error("{0} : URL did not get due status code : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
webpage = [] webpage = []
@ -79,10 +89,10 @@ class WPExport:
try: try:
page = self._request.get(i) page = self._request.get(i)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
self._logger.info("page : {0}".format(i)) self._logger.info("{0} : page : {1}".format(self._name, i))
if i not in webpage: if i not in webpage:
webpage.append(i) webpage.append(i)
soup = BeautifulSoup(page.text, self._parser) soup = BeautifulSoup(page.text, self._parser)
@ -94,13 +104,22 @@ class WPExport:
element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1] element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1]
number_page = element_lastpage.split("-")[0].split("p")[1] number_page = element_lastpage.split("-")[0].split("p")[1]
number_lastpage = int(number_page) / 10 number_lastpage = int(number_page) / 10
for j in range(1,int(number_lastpage)):
setPageDivided = int(number_lastpage) / max_thread
setPagePart = setPageDivided * (index_thread + 1)
firstPagePart = (setPagePart - setPageDivided)
self._logger.debug("{0} : Total page : {1}".format(self._name,int(number_lastpage)))
self._logger.debug("{0} : First range : {1}".format(self._name, int(firstPagePart)))
self._logger.debug("{0} : Last range : {1}".format(self._name, int(setPagePart)))
for j in range(int(firstPagePart),int(setPagePart)):
paging = j * 10 paging = j * 10
categorie = urlparse(i).path.split("/") categorie = urlparse(i).path.split("/")
url_paging = "{0}/archives/p{1}-10.html".format(self._url, paging) url_paging = "{0}/archives/p{1}-10.html".format(self._url, paging)
if len(categorie) > 2: if len(categorie) > 2:
url_paging = "{0}/archives/{1}/p{2}-10.html".format(self._url, categorie[2], paging) url_paging = "{0}/archives/{1}/p{2}-10.html".format(self._url, categorie[2], paging)
self._logger.info(url_paging) self._logger.info("{0} : {1}".format(self._name, url_paging))
if url_paging not in webpage: if url_paging not in webpage:
webpage.append(url_paging) webpage.append(url_paging)
page = self._request.get(url_paging) page = self._request.get(url_paging)
@ -118,7 +137,7 @@ class WPExport:
exit(1) exit(1)
webpage.append(o) webpage.append(o)
else: else:
self._logger.error("web didn't get due status code : {0}".format(page.status_code)) self._logger.error("{0} : web didn't get due status code : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug(page.content)
return webpage return webpage
@ -135,7 +154,7 @@ class WPExport:
makedir.append(i) makedir.append(i)
repath = "/".join(makedir) repath = "/".join(makedir)
if not os.path.exists(repath): if not os.path.exists(repath):
self._logger.debug("Dossier crée : {0}".format(repath)) self._logger.debug("{0} : Dossier crée : {1}".format(self._name, repath))
try: try:
if len(repath) > 0: if len(repath) > 0:
os.mkdir(repath) os.mkdir(repath)
@ -201,21 +220,21 @@ class WPExport:
try: try:
page = self._request.get(i) page = self._request.get(i)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
soup = BeautifulSoup(page.text, self._parser) soup = BeautifulSoup(page.text, self._parser)
img = soup.find_all("img") img = soup.find_all("img")
self._logger.info("image from page: {0} : ".format(i)) self._logger.info("{0} : image from page: {1} : ".format(self._name,i))
for anchor in img: for anchor in img:
src = anchor.get("src", "/") src = anchor.get("src", "/")
if src != "/": if src != "/":
if src not in page_img: if src not in page_img:
self._logger.info("image: {0} : ".format(src)) self._logger.info("{0} : image: {1} : ".format(self._name, src))
page_img.append(src) page_img.append(src)
else: else:
self._logger.error("Image did not get due status code : {0}".format(page.status_code)) self._logger.error("{0} : Image did not get due status code : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
return page_img return page_img
@ -243,7 +262,7 @@ class WPExport:
fileDownload = "{0}/{1}/index.html".format(backup_dir, o.netloc) fileDownload = "{0}/{1}/index.html".format(backup_dir, o.netloc)
if len(dir_page_web) > 0 and len(filePageWeb) > 0: if len(dir_page_web) > 0 and len(filePageWeb) > 0:
fileDownload = "{0}/{1}{2}/{3}".format(backup_dir, o.netloc, dir_page_web, filePageWeb) fileDownload = "{0}/{1}{2}/{3}".format(backup_dir, o.netloc, dir_page_web, filePageWeb)
self._logger.info("{0}/{1} : {2}".format(i+1, len(webpage), fileDownload)) self._logger.info("{0} : {1}/{2} : {3}".format(self._name, i+1, len(webpage), fileDownload))
try: try:
open(fileDownload, "wb").write(r.content) open(fileDownload, "wb").write(r.content)
except Exception as err: except Exception as err:

View File

@ -8,7 +8,8 @@ from requests.packages.urllib3.util.retry import Retry
class WPimport: class WPimport:
# Constructor # Constructor
def __init__(self, basic, wordpress, logger, parser): def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser"):
self._name = name
self._basic = basic self._basic = basic
self._wordpress = wordpress self._wordpress = wordpress
self._logger = logger self._logger = logger
@ -22,6 +23,11 @@ class WPimport:
self._request.mount('http://', HTTPAdapter(max_retries=retries)) self._request.mount('http://', HTTPAdapter(max_retries=retries))
# Destructor
def __del__(self):
self._logger.info("{0} : Import finished for {1}".format(self._name, self._wordpress))
# Public method # Public method
def setUrl(self, wordpress): def setUrl(self, wordpress):
@ -32,10 +38,10 @@ class WPimport:
try: try:
r = self._request.get(webpage[i]) r = self._request.get(webpage[i])
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err))
exit(1) exit(1)
if r.status_code == 200: if r.status_code == 200:
self._logger.info("({0}/{1} : Page en cours d'import : {2}".format(i+1, len(webpage), webpage[i])) self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i]))
soup = BeautifulSoup(r.content, self._parser) soup = BeautifulSoup(r.content, self._parser)
articlebody = soup.find_all("div", class_="articlebody") articlebody = soup.find_all("div", class_="articlebody")
if len(articlebody) > 0: if len(articlebody) > 0:
@ -43,25 +49,38 @@ class WPimport:
else: else:
self._addOrUpdateFeaturedMedia(soup) self._addOrUpdateFeaturedMedia(soup)
else: else:
self._logger.error("Connection error with status code : {0}".format(r.status_code)) self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code))
self._logger.debug(r.content) self._logger.debug("{0} : {1}".format(self._name, r.content))
def fromDirectory(self, directory): def fromDirectory(self, directory="", number_thread=1, max_thread=1):
directory = "{0}/archives".format(directory) directory = "{0}/archives".format(directory)
directories = self._getDirectories([], "{0}".format(directory)) directories = self._getDirectories([], "{0}".format(directory))
files = self._getFiles(directories) if len(directories) > 0:
self.fromFile(files) files = self._getFiles(directories)
self.fromFile(files, number_thread, max_thread)
else:
self._logger.error("{0} : No files for {1}".format(self._name, directory))
def fromFile(self, files): def fromFile(self, files=[], number_thread=1, max_thread=1):
for file in files: divFiles = int(len(files) / max_thread)
if os.path.exists(file): currentRangeFiles = int(divFiles * (number_thread+1))
self._logger.info("Fichier en cours de traitement : {0}".format(file)) firstRange = int(currentRangeFiles - divFiles)
with open(file, 'r') as f: self._logger.debug("{0} : index : {1}".format(self._name,number_thread))
self._logger.debug("{0} : first range : {1}".format(self._name,firstRange))
self._logger.debug("{0} : last range : {1}".format(self._name,currentRangeFiles))
for i in range(firstRange, currentRangeFiles):
if os.path.exists(files[i]):
self._logger.info("{0} : ({1}/{2}) File is being processed : {3}".format(self._name, i+1, currentRangeFiles + 1, files[i]))
with open(files[i], 'r') as f:
content = f.read() content = f.read()
self._logger.debug("{0} : Size of article : {1}".format(self._name, len(content)))
soup = BeautifulSoup(content, self._parser) soup = BeautifulSoup(content, self._parser)
articlebody = soup.find_all("div", class_="articlebody") articlebody = soup.find_all("div", class_="articlebody")
self._logger.debug("{0} : Number of article : {1}".format(self._name, len(articlebody)))
if len(articlebody) > 0: if len(articlebody) > 0:
self._addOrUpdatePost(soup) self._addOrUpdatePost(soup)
else: else:
@ -99,7 +118,7 @@ class WPimport:
try: try:
page = self._request.get("http://{0}/wp-json/wp/v2/search".format(self._wordpress), auth=self._basic, params=params) page = self._request.get("http://{0}/wp-json/wp/v2/search".format(self._wordpress), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
result = page.json() result = page.json()
@ -111,7 +130,7 @@ class WPimport:
try: try:
page = self._request.get(img_src) page = self._request.get(img_src)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for get featured media : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
name_img = img_src.replace("_q", "") name_img = img_src.replace("_q", "")
@ -120,7 +139,7 @@ class WPimport:
try: try:
page = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params) page = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error search featured media : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
res = page.json() res = page.json()
@ -130,27 +149,27 @@ class WPimport:
try: try:
r = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, result[0]["id"]), auth=self._basic, headers=self._headers_json, data=json.dumps(data)) r = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, result[0]["id"]), auth=self._basic, headers=self._headers_json, data=json.dumps(data))
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for post media featured : {1}".format(self._name, err))
exit(1) exit(1)
if r.status_code == 200: if r.status_code == 200:
self._logger.info("Ajout media featured : {0}".format(r.json()["title"]["raw"])) self._logger.info("{0} : Add media featured : {1}".format(self._name, r.json()["title"]["raw"]))
else: else:
self._logger.error("Connection error with status code : {0}".format(r.status_code)) self._logger.error("{0} : Connection error with status code for featured media : {1}".format(self._name, r.status_code))
self._logger.debug(r.content) self._logger.debug("{0} : {1}".format(self._name, r.content))
else: else:
self._logger.info("Aucun media trouvé pour {0}".format(h2)) self._logger.info("{0} : No media found for {1}".format(self._name, h2))
else: else:
self._logger.error("Connection error with status code : {0}".format(page.status_code)) self._logger.error("{0} : Connection error with status code for search featured media: {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
else: else:
self._logger.error("Connection error with status code : {0}".format(page.status_code)) self._logger.error("{0} : Connection error for get featured media with status code : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
else: else:
self._logger.error("Connection error with status code : {0}".format(page.status_code)) self._logger.error("{0} : Connection error with status code for featured media : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
## Association image to post ## Association image to post
@ -161,13 +180,13 @@ class WPimport:
try: try:
r = self._request.post("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, i["id"]), auth=self._basic, data=data) r = self._request.post("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, i["id"]), auth=self._basic, data=data)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for link image to post : {1}".format(self._name, err))
exit(1) exit(1)
if r.status_code == 200: if r.status_code == 200:
self._logger.info("Association d'une image à l'article {0}".format(title)) self._logger.info("{0} : Link image to post {1}".format(self._name, title))
else: else:
self._logger.error("Connection error with status code : {0}".format(r.status_code)) self._logger.error("{0} Connection error with status code for link image to post : {1}".format(self._name, r.status_code))
self._logger.debug(r.content) self._logger.debug("{0} : {1}".format(self._name, r.content))
## Add or update img ## Add or update img
@ -176,12 +195,14 @@ class WPimport:
media = {"id":"", "rendered":""} media = {"id":"", "rendered":""}
split_fileimg = href_img.split("/") split_fileimg = href_img.split("/")
img_name = split_fileimg[len(split_fileimg)-1] img_name = split_fileimg[len(split_fileimg)-1]
self._logger.debug("{0} : Search for image {1} with URL {2}".format(self._name, img_name, "http://{0}/wp-json/wp/v2/media".format(self._wordpress)))
params = { "search": img_name} params = { "search": img_name}
try: try:
r = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params) r = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for search media : {1}".format(self._name, err))
exit(1) exit(1)
self._logger.debug("{0} : Search for image {1} and his status code {2}".format(self._name, img_name, r.status_code))
if r.status_code == 200: if r.status_code == 200:
res = r.json() res = r.json()
if len(res) > 0: if len(res) > 0:
@ -189,13 +210,13 @@ class WPimport:
try: try:
r = self._request.delete("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, res[0]["id"]), auth=self._basic, params=params) r = self._request.delete("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, res[0]["id"]), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} Connection error for delete image : {1}".format(self._name, err))
exit(1) exit(1)
if r.status_code == 200: if r.status_code == 200:
self._logger.info("Image removed {0}".format(img_name)) self._logger.info("{0} : Image removed {1}".format(self._name, img_name))
else: else:
self._logger.error("Image not removed due status code : {0}".format(r.status_code)) self._logger.error("{0} : Image not removed due status code : {1}".format(self._name, r.status_code))
self._logger.debug(r.content) self._logger.debug("{0} : {1}".format(self._name, r.content))
data = page.content data = page.content
img_type = "image/png" img_type = "image/png"
@ -205,20 +226,20 @@ class WPimport:
try: try:
r = self._request.post("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, headers=headers, data=data) r = self._request.post("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, headers=headers, data=data)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for add image : {1}".format(self._name, err))
exit(1) exit(1)
if r.status_code == 201: if r.status_code == 201:
self._logger.info("Image added {0}".format(img_name)) self._logger.info("{0} : Image added {1}".format(self._name, img_name))
res = r.json() res = r.json()
media["id"] = res["id"] media["id"] = res["id"]
media["rendered"] = res["guid"]["rendered"] media["rendered"] = res["guid"]["rendered"]
else: else:
self._logger.error("Image not added due status code : {0}".format(r.status_code)) self._logger.error("{0} : Image not added due status code : {1}".format(self._name, r.status_code))
self._logger.debug(r.content) self._logger.debug(r.content)
else: else:
self._logger.error("Connection error with status code : {0}".format(r.status_code)) self._logger.error("{0} : Connection error for search image with status code : {1}".format(self._name, r.status_code))
self._logger.debug(r.content) self._logger.debug("{0} : {1}".format(self._name, r.content))
return media return media
@ -231,7 +252,7 @@ class WPimport:
params = {"post": post, "author_name":i["author"], "date":i["date"]} params = {"post": post, "author_name":i["author"], "date":i["date"]}
page = self._request.get("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, params=params) page = self._request.get("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for search comment : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
result = page.json() result = page.json()
@ -240,18 +261,18 @@ class WPimport:
params = {"force":1} params = {"force":1}
page = self._request.delete("http://{0}/wp-json/wp/v2/comments/{1}".format(self._wordpress, j["id"]), params=params, auth=self._basic) page = self._request.delete("http://{0}/wp-json/wp/v2/comments/{1}".format(self._wordpress, j["id"]), params=params, auth=self._basic)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for delete comment : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
self._logger.info("Comment deleted for {0}".format(title)) self._logger.info("{0} : Comment deleted for {1}".format(self._name, title))
self._logger.debug("Comment deleted : {0}".format(j)) self._logger.debug("{0} : Comment deleted : {1}".format(self._name, j))
else: else:
self._logger.error("Comment not deleted for {0} due status code : {1}".format(title, page.status_code)) self._logger.error("{0} : Comment not deleted for {1} due status code : {2}".format(self._name, title, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
else: else:
self._logger.error("Comment not listed for {0} due status code : {1}".format(title, page.status_code)) self._logger.error("{0} : Comment not listed for {1} due status code : {2}".format(self._name, title, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
for i in comment: for i in comment:
data = {"post": post, "content": i["content"], "date": i["date"], "author_name": i["author"], "status": "approved"} data = {"post": post, "content": i["content"], "date": i["date"], "author_name": i["author"], "status": "approved"}
@ -262,27 +283,27 @@ class WPimport:
try: try:
page = self._request.get("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, params=params) page = self._request.get("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for parent comment : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
result = page.json() result = page.json()
if len(result) > 0: if len(result) > 0:
data["parent"]=result[0]["id"] data["parent"]=result[0]["id"]
else: else:
self._logger.error("Connection error for parent comment with status code : {0}".format(page.status_code)) self._logger.error("{0} : Connection error for parent comment with status code : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
try: try:
page = self._request.post("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, data=data) page = self._request.post("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, data=data)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for add comment : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 201: if page.status_code == 201:
self._logger.info("Comment added for {0}".format(title)) self._logger.info("{0} : Comment added for {1}".format(self._name, title))
self._logger.debug("Data : {0}".format(data)) self._logger.debug("{0} : Data : {1}".format(self._name, data))
else: else:
self._logger.error("Comment not added for {0} due status code : {1}".format(title, page.status_code)) self._logger.error("{0} : Comment not added for {1} due status code : {2}".format(self._name, title, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
## Check class name ## Check class name
@ -346,6 +367,7 @@ class WPimport:
listelement[i] = [] listelement[i] = []
articletitle = soup.find_all("h2", class_="articletitle") articletitle = soup.find_all("h2", class_="articletitle")
self._logger.debug("{0} : Title of the article : {1}".format(self._name, articletitle))
articlebody = soup.find_all("div", class_="articlebody") articlebody = soup.find_all("div", class_="articlebody")
articledate = soup.find_all("span", class_="articledate") articledate = soup.find_all("span", class_="articledate")
articleacreator = soup.find_all("span", class_="articlecreator") articleacreator = soup.find_all("span", class_="articlecreator")
@ -353,10 +375,12 @@ class WPimport:
itemfooter = soup.find_all("div", class_="itemfooter") itemfooter = soup.find_all("div", class_="itemfooter")
comment = soup.find_all("li", class_="comment") comment = soup.find_all("li", class_="comment")
img_a = articlebody[0].find_all("a", {"target": "_blank"}) img_a = articlebody[0].find_all("a", {"target": "_blank"})
self._logger.debug("{0} : Number of image's link : {1}".format(self._name, len(img_a)))
list_img = [] list_img = []
for i in img_a: for i in img_a:
new_img = {} new_img = {}
img = i.find_all("img") img = i.find_all("img")
self._logger.debug("{0} : Number of image's tag : {1}".format(self._name, len(img)))
if len(img) > 0: if len(img) > 0:
href_a = i.get("href") href_a = i.get("href")
href_img = img[0].get("src") href_img = img[0].get("src")
@ -365,16 +389,16 @@ class WPimport:
try: try:
page_img = self._request.get(href_img) page_img = self._request.get(href_img)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err))
exit(1) exit(1)
if page_img.status_code == 404: if page_img.status_code == 404:
href_img = href_a href_img = href_a
try: try:
page_img = self._request.get(href_a) page_img = self._request.get(href_a)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err))
exit(1) exit(1)
self._logger.debug("{0} : Status code for image {1} : {2}".format(self._name, href_img, page_img.status_code))
if page_img.status_code == 200: if page_img.status_code == 200:
media=self._addOrUpdateMedia(href_img, page_img) media=self._addOrUpdateMedia(href_img, page_img)
new_img["id"]=media["id"] new_img["id"]=media["id"]
@ -386,10 +410,10 @@ class WPimport:
new_img["new_src"]=media["rendered"] new_img["new_src"]=media["rendered"]
list_img.append(new_img) list_img.append(new_img)
if page_img.status_code not in [200, 404]: if page_img.status_code not in [200, 404]:
self._logger.error("Connection error with status code : {0}".format(page_img.status_code)) self._logger.error("{0} : Connection error with status code for get image : {1}".format(self._name, page_img.status_code))
self._logger.debug(page_img.content) self._logger.debug("{0} : {1}".format(self._name, page_img.content))
self._logger.debug("{0} : Number of image : {1}".format(self._name, len(list_img)))
comment_post = self._getComment(comment) comment_post = self._getComment(comment)
a = itemfooter[0].find_all("a", {"rel": True}) a = itemfooter[0].find_all("a", {"rel": True})
@ -408,7 +432,7 @@ class WPimport:
params = {"params":j} params = {"params":j}
page = self._request.get("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, params=params) page = self._request.get("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for {1} : {2}".format(self._name, i, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
element_exist = True element_exist = True
@ -416,26 +440,26 @@ class WPimport:
listelement[i].append(result[0]["id"]) listelement[i].append(result[0]["id"])
else: else:
self._logger.error("{0} not found due status code : {1}".format(i, page.status_code)) self._logger.error("{0} : {1} not found due status code : {2}".format(self._name, i, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
if element_exist is False: if element_exist is False:
data = {"name": j} data = {"name": j}
self._logger.debug("URL : {0} ".format("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i))) self._logger.debug("{0} : URL : {1} ".format("http://{1}/wp-json/wp/v2/{2}".format(self._name, self._wordpress, i)))
self._logger.debug("data : {0}".format(data)) self._logger.debug("{0} : data : {1}".format(self._name, data))
self._logger.debug("headers : {0}".format(self._headers_form)) self._logger.debug("{0} : headers : {1}".format(self._name, self._headers_form))
try: try:
page = self._request.post("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, headers=self._headers_json, data=data) page = self._request.post("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, headers=self._headers_json, data=data)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for post {1} : {2}".format(self._name, i, err))
exit(1) exit(1)
if page.status_code == 201: if page.status_code == 201:
result = page.json() result = page.json()
listelement[i].append(result["id"]) listelement[i].append(result["id"])
else: else:
self._logger.error("{0} not added due status code : {1}".format(i, page.status_code)) self._logger.error("{0} : {1} not added due status code : {2}".format(self._name, i, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
title = articletitle[0].text title = articletitle[0].text
author = articleacreator[0].text.lower() author = articleacreator[0].text.lower()
@ -458,21 +482,21 @@ class WPimport:
try: try:
page = self._request.get("http://{0}/wp-json/wp/v2/users".format(self._wordpress), auth=self._basic, params=params) page = self._request.get("http://{0}/wp-json/wp/v2/users".format(self._wordpress), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for get author : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
result = page.json() result = page.json()
data["author"] = result[0]["id"] data["author"] = result[0]["id"]
else: else:
self._logger.error("Connection error with status code : {0}".format(page.status_code)) self._logger.error("{0} : Connection error with status code for get author : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(page.content))
params = {"search":title} params = {"search":title}
try: try:
page = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params) page = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for search post : {1}".format(self._name, err))
exit(1) exit(1)
page_exist = True page_exist = True
headers = {'Content-Type': 'application/json', 'Accept':'application/json'} headers = {'Content-Type': 'application/json', 'Accept':'application/json'}
@ -481,38 +505,38 @@ class WPimport:
if len(result) == 0: if len(result) == 0:
page_exist = False page_exist = False
else: else:
self._logger.info("La page {0} existe deja et mis à jour".format(title)) self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, title))
post_id = result[0]["id"] post_id = result[0]["id"]
try: try:
page = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, data=json.dumps(data)) page = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, data=json.dumps(data))
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for update post : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
result = page.json() result = page.json()
self._logger.info("Article mis à jour : {0}".format(result["title"]["raw"])) self._logger.info("{0} : Post updated : {1}".format(self._name, result["title"]["raw"]))
self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"])
self._linkImgPost(result["title"]["raw"], list_img, result["id"]) self._linkImgPost(result["title"]["raw"], list_img, result["id"])
else: else:
self._logger.error("Post not updated due status code : {0}".format(page.status_code)) self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
else: else:
self._logger.error("Connection for update post error with status code : {0}".format(page.status_code)) self._logger.error("{0} : Connection for update post error with status code : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
if page_exist == False: if page_exist == False:
try: try:
page = self._request.post("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, headers=headers, data=json.dumps(data)) page = self._request.post("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, headers=headers, data=json.dumps(data))
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for create post : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 201: if page.status_code == 201:
result = page.json() result = page.json()
self._logger.info("Article ajoute : {0}".format(result["title"]["raw"])) self._logger.info("{0} : Post added : {1}".format(self._name, result["title"]["raw"]))
self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"])
self._linkImgPost(result["title"]["raw"], list_img, result["id"]) self._linkImgPost(result["title"]["raw"], list_img, result["id"])
else: else:
self._logger.error("Post not added due status code : {0}".format(r.status_code)) self._logger.error("{0} : Post not added due status code : {1}".format(self._name, r.status_code))
self._logger.debug(r.content) self._logger.debug("{0} : {1}".format(self._name, r.content))