Compare commits

a67ff868f3e64b2692365acbe7d7068e42e8e53b..d1b6e8048acabf5fe8797807d4d78ef8ce09b6c1

No commits in common. "a67ff868f3e64b2692365acbe7d7068e42e8e53b" and "d1b6e8048acabf5fe8797807d4d78ef8ce09b6c1" have entirely different histories.

3 changed files with 51 additions and 51 deletions

View File

@@ -54,6 +54,7 @@ def download(name_thread, max_thread, url, logger, parser, directory, html, img,
del exportWp del exportWp
def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress, ssl_canalblog, create, update, image, revert, tmp): def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress, ssl_canalblog, create, update, image, revert, tmp):
canalblog = canalblog.split(",") canalblog = canalblog.split(",")
wordpress = wordpress.split(",") wordpress = wordpress.split(",")
@@ -106,23 +107,23 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas
del importWp del importWp
def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial, ssl_wordpress, create, update, image): def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial, ssl_wordpress, create, update, image, tmp, revert):
name = "Thread-{0}".format(int(name_thread) + 1) name = "Thread-{0}".format(int(name_thread) + 1)
directory = directory.split(",") directory = directory.split(",")
wordpress = wordpress.split(",") wordpress = wordpress.split(",")
if serial is False: if serial is False:
for i in wordpress: for i in wordpress:
importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image) importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp, revert=revert)
for j in directory: for j in directory:
importWp.fromDirectory(j, name_thread, max_thread) importWp.fromDirectory(j, name_thread, max_thread)
del importWp del importWp
else: else:
if len(directory) != len(wordpress): if len(directory) != len(wordpress):
logger.error("{0} : Error : Number directory is different than wordpress".format(name)) logger.error("{0} : Error : Number directory is differant than wordpress".format(name))
exit(1) exit(1)
for i in range(0, len(wordpress)-1): for i in range(0, len(wordpress)-1):
importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image) importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp, revert=revert)
importWp.fromDirectory(directory[i]) importWp.fromDirectory(directory[i])
del importWp del importWp
@@ -251,7 +252,7 @@ if __name__ == '__main__':
wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ] wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ]
wait(wait_for, return_when=ALL_COMPLETED) wait(wait_for, return_when=ALL_COMPLETED)
wait_for = [ wait_for = [
ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, args.create, args.update, args.image) ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, args.create, args.update, args.image, TMP, revert)
for i in range(0, int(args.parallel)) for i in range(0, int(args.parallel))
] ]
except Exception as err: except Exception as err:
@@ -261,13 +262,6 @@ if __name__ == '__main__':
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ] wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ]
wait(wait_for, return_when=ALL_COMPLETED) wait(wait_for, return_when=ALL_COMPLETED)
if args.revert is True:
files_tmp = glob.glob("{0}/*.json".format(tmp))
if len(files_tmp) > 0:
if len(files_tmp) != args.parallel:
for file_r in files_tmp:
os.remove(file_r)
wait_for = [ wait_for = [
ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, ssl_canalblog, args.create, args.update, args.image, args.revert, TMP) ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, ssl_canalblog, args.create, args.update, args.image, args.revert, TMP)
for i in range(0, int(args.parallel)) for i in range(0, int(args.parallel))

View File

@@ -1,7 +1,7 @@
#!/usr/bin/python3 #!/usr/bin/python3
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urlparse from urllib.parse import urlparse
import requests, os, argparse, logging, json import requests, os, argparse, logging
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry from requests.packages.urllib3.util.retry import Retry

View File

@@ -8,7 +8,7 @@ from requests.packages.urllib3.util.retry import Retry
class WPimport: class WPimport:
# Constructor # Constructor
def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog"): def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog", revert=False):
self._name = name self._name = name
self._basic = basic self._basic = basic
self._wordpress = wordpress self._wordpress = wordpress
@@ -29,6 +29,7 @@ class WPimport:
self._no_update = no_update self._no_update = no_update
self._no_image = no_image self._no_image = no_image
self._tmp = tmp self._tmp = tmp
self._revert = revert
# Destructor # Destructor
def __del__(self): def __del__(self):
@@ -41,50 +42,55 @@ class WPimport:
def fromUrl(self, first, second): def fromUrl(self, first, second):
try: try:
with open("{0}/{1}.json".format(self._tmp, self._name)) as file: content_file = open("{0}/{1}.json".format(self._name, self._tmp))
webpage_content = json.loads(file.read()) webpage_content = json.loads(content_file)
self._logger.debug("{0} : size of webpage : {1}".format(self._name, len(webpage_content))) webpage = webpage_content[first][second]
webpage = webpage_content[first][second] for i in range(0, len(webpage)):
for i in range(0, len(webpage)): try:
try: r = self._request.get(webpage[i])
r = self._request.get(webpage[i]) if r.status_code == 200:
if r.status_code == 200: self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i]))
self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i])) soup = BeautifulSoup(r.content, self._parser)
soup = BeautifulSoup(r.content, self._parser) articlebody = soup.find_all("div", class_="articlebody")
articlebody = soup.find_all("div", class_="articlebody") if len(articlebody) > 0:
if len(articlebody) > 0: self._addOrUpdatePost(soup)
self._addOrUpdatePost(soup)
else:
self._addOrUpdateFeaturedMedia(soup)
del webpage_content[first][second][i]
webpage_content = json.dumps(webpage_content)
open("{0}/{1}.json".format(self._tmp, self._name), "wt").write(webpage_content)
else: else:
self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code)) self._addOrUpdateFeaturedMedia(soup)
self._logger.debug("{0} : {1}".format(self._name, r.content)) del webpage_content[first][second][i]
except ConnectionError as err: open("{0}/{1}.json".format(self._tmp, self._name), "wt").write(webpage_content)
self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err)) else:
exit(1) self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code))
except IOError as err: self._logger.debug("{0} : {1}".format(self._name, r.content))
self._logger.error("{0} : Connection error for IO url {1} : {2}".format(self._name, webpage[i], err)) except ConnectionError as err:
exit(1) self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err))
except Exception as err: exit(1)
self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, webpage[i], err)) except IOError as err:
self._logger.error("{0} : Connection error for IO url {1} : {2}".format(self._name, webpage[i], err))
exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, webpage[i], err))
except Exception as ex: except Exception as ex:
self._logger.error("{0} : Read file json from tmp : {1}".format(self._name, ex)) self._logger.error("{0} : Read file json from tmp : {1}".format(self._name, ex))
def fromDirectory(self, directory="", number_thread=1, max_thread=1): def fromDirectory(self, directory="", number_thread=1, max_thread=1):
self._directory = directory if self._revert:
directory = "{0}/archives".format(directory) self._directory = directory
directories = self._getDirectories([], "{0}".format(directory)) directory = "{0}/archives".format(directory)
if len(directories) > 0: directories = self._getDirectories([], "{0}".format(directory))
files = self._getFiles(directories) if len(directories) > 0:
self.fromFile(files=files, number_thread=number_thread, max_thread=max_thread) files = self._getFiles(directories)
self.fromFile(files, number_thread, max_thread)
else:
self._logger.error("{0} : No files for {1}".format(self._name, directory))
else: else:
self._logger.error("{0} : No files for {1}".format(self._name, directory)) try:
files = open("{0}/{1}.json".format(self._name, self._tmp))
self.fromFile(files, number_thread, max_thread)
except Exception as ex:
self._logger.error("{0} : Read file json from tmp : {1}".format(self._name, ex))