14 Commits

4 changed files with 196 additions and 74 deletions

View File

@@ -5,15 +5,27 @@ from urllib.parse import urlparse
from concurrent import futures from concurrent import futures
from concurrent.futures import as_completed, wait, ALL_COMPLETED from concurrent.futures import as_completed, wait, ALL_COMPLETED
import argparse, logging, threading import argparse, logging, threading, os, glob
from lib.WPImport import WPimport from lib.WPImport import WPimport
from lib.WPExport import WPExport from lib.WPExport import WPExport
from lib.WPRemove import WPRemove from lib.WPRemove import WPRemove
from lib.WPChange import WPChange from lib.WPChange import WPChange
def change(index, number, args, logger): def errorRevert(logger, revert, tmp):
changeWp = WPChange(logger=logger, index_name=index, number_thread=number) if revert is True:
changeWp.fromDirectory(args.directory) files_tmp = glob.glob("{0}/*.json".format(tmp))
if len(files_tmp) == 0:
logger.error("Error revert, because files not found")
exit(1)
if len(files_tmp) != int(args.parallel):
for file_r in files_tmp:
os.remove(file_r)
logger.error("Error revert, because number files tmp is incompatible with parallel number")
exit(1)
def change(index, number, args, logger, tmp, revert):
changeWp = WPChange(logger=logger, index_name=index, number_thread=number, tmp=tmp)
changeWp.fromDirectory(args.directory, revert)
del changeWp del changeWp
@@ -40,21 +52,21 @@ def remove(index, number, args, basic, logger, ssl_wordpress):
del removeWp del removeWp
def download(name_thread, max_thread, url, logger, parser, directory, html, img, ssl_canalblog): def download(name_thread, max_thread, url, logger, parser, directory, html, img, ssl_canalblog, revert, tmp):
exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory, ssl_canalblog=ssl_canalblog) exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory, ssl_canalblog=ssl_canalblog)
webpage = exportWp.getUrlPage(name_thread, max_thread) if not revert:
exportWp.getUrlPage(name_thread, max_thread)
for i in ["article", "page"]: for i in ["article", "page"]:
for j in ["publications", "principal"]: for j in ["publications", "principal"]:
if html is False: if html is False:
exportWp.downloadHTML(webpage[j][i]) exportWp.downloadHTML(j, i)
if img is False: if img is False:
exportWp.downloadImg(webpage[j][i]) exportWp.downloadImg(j, i)
del exportWp
def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress, ssl_canalblog, create, update, image, revert, tmp):
def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress, ssl_canalblog, create, update, image):
canalblog = canalblog.split(",") canalblog = canalblog.split(",")
wordpress = wordpress.split(",") wordpress = wordpress.split(",")
name = "Thread-{0}".format(int(name_thread) + 1) name = "Thread-{0}".format(int(name_thread) + 1)
@@ -70,14 +82,15 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas
except Exception as err: except Exception as err:
logger.error("{0} : parsing error : {1}".format(name, err)) logger.error("{0} : parsing error : {1}".format(name, err))
exit(1) exit(1)
exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, ssl_canalblog=ssl_canalblog) exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, ssl_canalblog=ssl_canalblog, tmp=tmp)
webpage = exportWp.getUrlPage(name_thread, max_thread) if not revert:
exportWp.getUrlPage(name_thread, max_thread)
del exportWp del exportWp
for j in wordpress: for j in wordpress:
importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image) importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp)
for k in ["article", "page"]: for k in ["article", "page"]:
for l in ["publications", "principal"]: for l in ["publications", "principal"]:
importWp.fromUrl(webpage[l][k]) importWp.fromUrl(l, k)
del importWp del importWp
else: else:
@@ -93,9 +106,10 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas
logger.error("parsing error : {0}".format(err)) logger.error("parsing error : {0}".format(err))
exit(1) exit(1)
exportWp = WPExport(name=name, url=url, logger=logger, parser=parser, ssl_canalblog=ssl_canalblog) exportWp = WPExport(name=name, url=url, logger=logger, parser=parser, ssl_canalblog=ssl_canalblog)
webpage = exportWp.getUrlPage(name_thread, max_thread) if not revert:
exportWp.getUrlPage(name_thread, max_thread)
del exportWp del exportWp
importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image) importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp)
for k in ["article", "page"]: for k in ["article", "page"]:
for l in ["publications", "principal"]: for l in ["publications", "principal"]:
@@ -104,7 +118,7 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas
del importWp del importWp
def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial, ssl_wordpress, create, update, image): def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial, ssl_wordpress, create, update, image, revert):
name = "Thread-{0}".format(int(name_thread) + 1) name = "Thread-{0}".format(int(name_thread) + 1)
directory = directory.split(",") directory = directory.split(",")
wordpress = wordpress.split(",") wordpress = wordpress.split(",")
@@ -112,16 +126,16 @@ def importDirectory(name_thread, max_thread, directory, logger, parser, wordpres
for i in wordpress: for i in wordpress:
importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image) importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image)
for j in directory: for j in directory:
importWp.fromDirectory(j, name_thread, max_thread) importWp.fromDirectory(j, name_thread, max_thread, revert)
del importWp del importWp
else: else:
if len(directory) != len(wordpress): if len(directory) != len(wordpress):
logger.error("{0} : Error : Number directory is differant than wordpress".format(name)) logger.error("{0} : Error : Number directory is different than wordpress".format(name))
exit(1) exit(1)
for i in range(0, len(wordpress)-1): for i in range(0, len(wordpress)-1):
importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image) importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image)
importWp.fromDirectory(directory[i]) importWp.fromDirectory(directory[i], name_thread, max_thread, revert)
del importWp del importWp
@@ -134,6 +148,8 @@ if __name__ == '__main__':
parser.add_argument("--parser", help="Parser content", default="html.parser") parser.add_argument("--parser", help="Parser content", default="html.parser")
parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1) parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1)
parser.add_argument("--no-ssl", help="No ssl for canalblog and/or wordpress (example wordpress,canalblog)", dest="ssl", default="") parser.add_argument("--no-ssl", help="No ssl for canalblog and/or wordpress (example wordpress,canalblog)", dest="ssl", default="")
parser.add_argument("--revert", help="Restart a work from stopping work", action="store_true")
parser.add_argument("--tmp", help="directory tmp", default="/tmp/import_export_canablog")
subparsers = parser.add_subparsers(dest="command") subparsers = parser.add_subparsers(dest="command")
@@ -187,8 +203,7 @@ if __name__ == '__main__':
default="", default="",
help="File") help="File")
args = parser.parse_args() args = parser.parse_args()
logger = logging.getLogger('import export canalblog') logger = logging.getLogger('import export canalblog')
@@ -223,6 +238,8 @@ if __name__ == '__main__':
fileHandler.setFormatter(formatter) fileHandler.setFormatter(formatter)
logger.addHandler(fileHandler) logger.addHandler(fileHandler)
os.makedirs(args.tmp, exist_ok=True)
if args.command == "import" or args.command == "remove": if args.command == "import" or args.command == "remove":
password = args.password password = args.password
if len(args.password) == 0: if len(args.password) == 0:
@@ -244,8 +261,9 @@ if __name__ == '__main__':
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ] wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ]
wait(wait_for, return_when=ALL_COMPLETED) wait(wait_for, return_when=ALL_COMPLETED)
errorRevert(logger, args.revert, args.tmp)
wait_for = [ wait_for = [
ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, args.create, args.update, args.image) ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, args.create, args.update, args.image, args.revert)
for i in range(0, int(args.parallel)) for i in range(0, int(args.parallel))
] ]
except Exception as err: except Exception as err:
@@ -255,8 +273,9 @@ if __name__ == '__main__':
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ] wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ]
wait(wait_for, return_when=ALL_COMPLETED) wait(wait_for, return_when=ALL_COMPLETED)
errorRevert(logger, args.revert, args.tmp)
wait_for = [ wait_for = [
ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, ssl_canalblog, args.create, args.update, args.image) ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, ssl_canalblog, args.create, args.update, args.image, args.revert, args.tmp)
for i in range(0, int(args.parallel)) for i in range(0, int(args.parallel))
] ]
@@ -294,7 +313,7 @@ if __name__ == '__main__':
try: try:
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [ wait_for = [
ex.submit(download, i, int(args.parallel), url, logger, args.parser, args.directory, args.html, args.img, ssl_canalblog) ex.submit(download, i, int(args.parallel), url, logger, args.parser, args.directory, args.html, args.img, ssl_canalblog, args.revert, args.tmp)
for i in range(0, int(args.parallel)) for i in range(0, int(args.parallel))
] ]
except Exception as err: except Exception as err:
@@ -314,7 +333,8 @@ if __name__ == '__main__':
if len(args.directory) > 0: if len(args.directory) > 0:
try: try:
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [ ex.submit(change, i, args.parallel, args, logger) for i in range(0, int(args.parallel)) ] errorRevert(logger, args.revert, args.tmp)
wait_for = [ ex.submit(change, i, args.parallel, args, logger, args.tmp, args.revert) for i in range(0, int(args.parallel)) ]
except Exception as err: except Exception as err:
logger.error("Thread error for remove : {0}".format(err)) logger.error("Thread error for remove : {0}".format(err))
if len(args.file) > 0: if len(args.file) > 0:

View File

@@ -4,11 +4,13 @@ import requests, os, logging, re, json
class WPChange: class WPChange:
# Constructor # Constructor
def __init__(self, index_name=1, number_thread=1, logger=None, parser="html.parser"): def __init__(self, index_name=1, number_thread=1, logger=None, parser="html.parser", tmp="/tmp/import_export_canablog"):
self._name = "Thread-{0}".format(index_name) self._name = "Thread-{0}".format(index_name)
self._logger = logger self._logger = logger
self._number_thread = number_thread self._number_thread = number_thread
self._parser = parser self._parser = parser
self._tmp = tmp
self._index_name = index_name
# Destructor # Destructor
def __del__(self): def __del__(self):
@@ -37,19 +39,62 @@ class WPChange:
## From directory ## From directory
def fromDirectory(self, directory="", number_thread=1, max_thread=1):
def fromDirectory(self, directory="", revert=False):
self._directory = directory
directory = "{0}/archives".format(directory) directory = "{0}/archives".format(directory)
directories = self._getDirectories([], "{0}".format(directory)) directories = self._getDirectories([], "{0}".format(directory))
if len(directories) > 0: if len(directories) > 0:
files = self._getFiles(directories) files = self._getFiles(directories)
self.fromFile(files, number_thread, max_thread) if revert is False:
self._tmpFiles(files=files, number_thread=self._index_name, max_thread=self._number_thread)
self._fromFileTmp()
else: else:
self._logger.error("{0} : No files for {1}".format(self._name, directory)) self._logger.error("{0} : No files for {1}".format(self._name, directory))
def fromFile(self, files=[]):
for i in range(0, len(files)):
if os.path.exists(files[i]):
self._logger.info("{0} : ({1}/{2}) File is being processed : {3}".format(self._name, i+1, len(files), files[i]))
self._change(files[i])
# Private method # Private method
def _fromFileTmp(self):
try:
with open("{0}/{1}.json".format(self._tmp, self._name)) as file:
files = json.loads(file.read())
self._logger.debug("{0} : size of webpage : {1}".format(self._name, len(files)))
for i in range(0, len(files)):
if os.path.exists(files[i]):
self._logger.info("{0} : ({1}/{2}) File is being processed : {3}".format(self._name, i+1, len(files), files[i]))
self._change(files[i])
except Exception as ex:
self._logger.error("{0} : Read file json from tmp : {1}".format(self._name, ex))
def _tmpFiles(self, files=[], number_thread=1, max_thread=1):
print()
divFiles = int(len(files) / int(max_thread))
currentRangeFiles = int(divFiles * (int(number_thread)+1))
firstRange = int(currentRangeFiles - divFiles)
self._logger.debug("{0} : index : {1}".format(self._name,number_thread))
self._logger.debug("{0} : first range : {1}".format(self._name,firstRange))
self._logger.debug("{0} : last range : {1}".format(self._name,currentRangeFiles))
webpage = []
for i in range(firstRange, currentRangeFiles):
webpage.append(files[i])
try:
string_webpage = json.dumps(webpage)
open("{0}/{1}.json".format(self._tmp, self._name), "wt").write(string_webpage)
except Exception as ex:
self._logger.error("{0} : Error for writing webpage : {1}".format(self._name, ex))
## Get all files ## Get all files
def _getFiles(self, item): def _getFiles(self, item):

View File

@@ -1,12 +1,12 @@
#!/usr/bin/python3 #!/usr/bin/python3
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urlparse from urllib.parse import urlparse
import requests, os, argparse, logging import requests, os, argparse, logging, json
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry from requests.packages.urllib3.util.retry import Retry
class WPExport: class WPExport:
def __init__(self, name = "Thread-0", url = "", logger = None, parser = "html.parser", directory = "backup", ssl_canalblog=True): def __init__(self, name = "Thread-0", url = "", logger = None, parser = "html.parser", directory = "backup", ssl_canalblog=True, tmp="/tmp/import_export_canablog"):
self._url = url self._url = url
self._logger = logger self._logger = logger
self._parser = parser self._parser = parser
@@ -22,7 +22,7 @@ class WPExport:
status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2) status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)
self._request.mount('{0}://'.format(self._protocol), HTTPAdapter(max_retries=retries)) self._request.mount('{0}://'.format(self._protocol), HTTPAdapter(max_retries=retries))
self._tmp = tmp
# Destructor # Destructor
def __del__(self): def __del__(self):
@@ -57,13 +57,13 @@ class WPExport:
# Download HTML # Download HTML
def downloadHTML(self, webpage): def downloadHTML(self, first, second):
self._downloadPage(webpage, self._dir) self._downloadPage(webpage[first][second], self._dir)
# Download Image # Download Image
def downloadImg(self, webpage): def downloadImg(self, first, second):
page_src = self._getImg(webpage) page_src = self._getImg(webpage[first][second])
o = urlparse(self._url) o = urlparse(self._url)
self._downloadPage(page_src, "{0}/{1}/{2}".format(self._dir, o.path, "img")) self._downloadPage(page_src, "{0}/{1}/{2}".format(self._dir, o.path, "img"))
@@ -161,7 +161,14 @@ class WPExport:
except Exception as err: except Exception as err:
self._logger.error("{0} : Exception error : {1}".format(self._name, err)) self._logger.error("{0} : Exception error : {1}".format(self._name, err))
exit(1) exit(1)
return webpage try:
string_webpage = json.dumps(webpage)
open("{0}/{1}.json".format(self._tmp, self._name), "wt").write(string_webpage)
except Exception as ex:
self._logger.error("{0} : Error for writing webpage : {1}".format(self._name, ex))

View File

@@ -8,7 +8,7 @@ from requests.packages.urllib3.util.retry import Retry
class WPimport: class WPimport:
# Constructor # Constructor
def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False): def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog"):
self._name = name self._name = name
self._basic = basic self._basic = basic
self._wordpress = wordpress self._wordpress = wordpress
@@ -28,6 +28,7 @@ class WPimport:
self._no_create = no_create self._no_create = no_create
self._no_update = no_update self._no_update = no_update
self._no_image = no_image self._no_image = no_image
self._tmp = tmp
# Destructor # Destructor
def __del__(self): def __del__(self):
@@ -38,51 +39,58 @@ class WPimport:
def setUrl(self, wordpress): def setUrl(self, wordpress):
self._wordpress = wordpress self._wordpress = wordpress
def fromUrl(self, webpage): def fromUrl(self, first, second):
for i in range(0, len(webpage)): try:
try: with open("{0}/{1}.json".format(self._tmp, self._name)) as file:
r = self._request.get(webpage[i]) webpage_content = json.loads(file.read())
if r.status_code == 200: self._logger.debug("{0} : size of webpage : {1}".format(self._name, len(webpage_content)))
self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i])) webpage = webpage_content[first][second]
soup = BeautifulSoup(r.content, self._parser) for i in range(0, len(webpage)):
articlebody = soup.find_all("div", class_="articlebody") try:
if len(articlebody) > 0: r = self._request.get(webpage[i])
self._addOrUpdatePost(soup) if r.status_code == 200:
else: self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i]))
self._addOrUpdateFeaturedMedia(soup) soup = BeautifulSoup(r.content, self._parser)
else: articlebody = soup.find_all("div", class_="articlebody")
self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code)) if len(articlebody) > 0:
self._logger.debug("{0} : {1}".format(self._name, r.content)) self._addOrUpdatePost(soup)
except ConnectionError as err: else:
self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err)) self._addOrUpdateFeaturedMedia(soup)
exit(1) del webpage_content[first][second][i]
except Exception as err: webpage_content = json.dumps(webpage_content)
self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, webpage[i], err)) open("{0}/{1}.json".format(self._tmp, self._name), "wt").write(webpage_content)
else:
self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))
except ConnectionError as err:
self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err))
exit(1)
except IOError as err:
self._logger.error("{0} : Connection error for IO url {1} : {2}".format(self._name, webpage[i], err))
exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, webpage[i], err))
except Exception as ex:
self._logger.error("{0} : Read file json from tmp : {1}".format(self._name, ex))
def fromDirectory(self, directory="", number_thread=1, max_thread=1): def fromDirectory(self, directory="", number_thread=1, max_thread=1, revert=False):
self._directory = directory self._directory = directory
directory = "{0}/archives".format(directory) directory = "{0}/archives".format(directory)
directories = self._getDirectories([], "{0}".format(directory)) directories = self._getDirectories([], "{0}".format(directory))
if len(directories) > 0: if len(directories) > 0:
files = self._getFiles(directories) files = self._getFiles(directories)
self.fromFile(files, number_thread, max_thread) if revert is False:
self._tmpFiles(files=files, number_thread=number_thread, max_thread=max_thread)
self._fromFileTmp()
else: else:
self._logger.error("{0} : No files for {1}".format(self._name, directory)) self._logger.error("{0} : No files for {1}".format(self._name, directory))
def fromFile(self, files=[], number_thread=1, max_thread=1): def fromFile(self, files=[]):
divFiles = int(len(files) / max_thread) for i in range(0, len(files)):
currentRangeFiles = int(divFiles * (number_thread+1))
firstRange = int(currentRangeFiles - divFiles)
self._logger.debug("{0} : index : {1}".format(self._name,number_thread))
self._logger.debug("{0} : first range : {1}".format(self._name,firstRange))
self._logger.debug("{0} : last range : {1}".format(self._name,currentRangeFiles))
for i in range(firstRange, currentRangeFiles):
if os.path.exists(files[i]): if os.path.exists(files[i]):
self._logger.info("{0} : ({1}/{2}) File is being processed : {3}".format(self._name, i+1, currentRangeFiles + 1, files[i])) self._logger.info("{0} : ({1}/{2}) File is being processed : {3}".format(self._name, i+1, len(files), files[i]))
with open(files[i], 'r') as f: with open(files[i], 'r') as f:
content = f.read() content = f.read()
self._logger.debug("{0} : Size of article : {1}".format(self._name, len(content))) self._logger.debug("{0} : Size of article : {1}".format(self._name, len(content)))
@@ -93,10 +101,52 @@ class WPimport:
self._addOrUpdatePost(soup) self._addOrUpdatePost(soup)
else: else:
self._addOrUpdateFeaturedMedia(soup) self._addOrUpdateFeaturedMedia(soup)
# Private method # Private method
def _fromFileTmp(self):
try:
with open("{0}/{1}.json".format(self._tmp, self._name)) as file:
files = json.loads(file.read())
self._logger.debug("{0} : size of webpage : {1}".format(self._name, len(files)))
for i in range(0, len(files)):
if os.path.exists(files[i]):
self._logger.info("{0} : ({1}/{2}) File is being processed : {3}".format(self._name, i+1, len(files), files[i]))
with open(files[i], 'r') as f:
content = f.read()
self._logger.debug("{0} : Size of article : {1}".format(self._name, len(content)))
soup = BeautifulSoup(content, self._parser)
articlebody = soup.find_all("div", class_="articlebody")
self._logger.debug("{0} : Number of article : {1}".format(self._name, len(articlebody)))
if len(articlebody) > 0:
self._addOrUpdatePost(soup)
else:
self._addOrUpdateFeaturedMedia(soup)
except Exception as ex:
self._logger.error("{0} : Read file json from tmp : {1}".format(self._name, ex))
def _tmpFiles(self, files=[], number_thread=1, max_thread=1):
divFiles = int(len(files) / max_thread)
currentRangeFiles = int(divFiles * (number_thread+1))
firstRange = int(currentRangeFiles - divFiles)
self._logger.debug("{0} : index : {1}".format(self._name,number_thread))
self._logger.debug("{0} : first range : {1}".format(self._name,firstRange))
self._logger.debug("{0} : last range : {1}".format(self._name,currentRangeFiles))
webpage = []
for i in range(firstRange, currentRangeFiles):
webpage.append(files[i])
try:
string_webpage = json.dumps(webpage)
open("{0}/{1}.json".format(self._tmp, self._name), "wt").write(string_webpage)
except Exception as ex:
self._logger.error("{0} : Error for writing webpage : {1}".format(self._name, ex))
## replace caracter ## replace caracter
def _replaceCaracter(self, title_rendered): def _replaceCaracter(self, title_rendered):