Compare commits

...

3 Commits

Author SHA1 Message Date
a67ff868f3 fix json read file 2023-06-26 23:52:03 +02:00
8e0abc40bd check files tmp 2023-06-26 23:09:54 +02:00
9149a6c5cb rollback webpage 2023-06-26 22:44:42 +02:00
3 changed files with 51 additions and 51 deletions

View File

@ -54,7 +54,6 @@ def download(name_thread, max_thread, url, logger, parser, directory, html, img,
del exportWp del exportWp
def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress, ssl_canalblog, create, update, image, revert, tmp): def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress, ssl_canalblog, create, update, image, revert, tmp):
canalblog = canalblog.split(",") canalblog = canalblog.split(",")
wordpress = wordpress.split(",") wordpress = wordpress.split(",")
@ -107,23 +106,23 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas
del importWp del importWp
def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial, ssl_wordpress, create, update, image, tmp, revert): def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial, ssl_wordpress, create, update, image):
name = "Thread-{0}".format(int(name_thread) + 1) name = "Thread-{0}".format(int(name_thread) + 1)
directory = directory.split(",") directory = directory.split(",")
wordpress = wordpress.split(",") wordpress = wordpress.split(",")
if serial is False: if serial is False:
for i in wordpress: for i in wordpress:
importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp, revert=revert) importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image)
for j in directory: for j in directory:
importWp.fromDirectory(j, name_thread, max_thread) importWp.fromDirectory(j, name_thread, max_thread)
del importWp del importWp
else: else:
if len(directory) != len(wordpress): if len(directory) != len(wordpress):
logger.error("{0} : Error : Number directory is differant than wordpress".format(name)) logger.error("{0} : Error : Number directory is different than wordpress".format(name))
exit(1) exit(1)
for i in range(0, len(wordpress)-1): for i in range(0, len(wordpress)-1):
importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp, revert=revert) importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image)
importWp.fromDirectory(directory[i]) importWp.fromDirectory(directory[i])
del importWp del importWp
@ -252,7 +251,7 @@ if __name__ == '__main__':
wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ] wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ]
wait(wait_for, return_when=ALL_COMPLETED) wait(wait_for, return_when=ALL_COMPLETED)
wait_for = [ wait_for = [
ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, args.create, args.update, args.image, TMP, revert) ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, args.create, args.update, args.image)
for i in range(0, int(args.parallel)) for i in range(0, int(args.parallel))
] ]
except Exception as err: except Exception as err:
@ -262,6 +261,13 @@ if __name__ == '__main__':
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ] wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ]
wait(wait_for, return_when=ALL_COMPLETED) wait(wait_for, return_when=ALL_COMPLETED)
if args.revert is True:
files_tmp = glob.glob("{0}/*.json".format(tmp))
if len(files_tmp) > 0:
if len(files_tmp) != args.parallel:
for file_r in files_tmp:
os.remove(file_r)
wait_for = [ wait_for = [
ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, ssl_canalblog, args.create, args.update, args.image, args.revert, TMP) ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, ssl_canalblog, args.create, args.update, args.image, args.revert, TMP)
for i in range(0, int(args.parallel)) for i in range(0, int(args.parallel))

View File

@ -1,7 +1,7 @@
#!/usr/bin/python3 #!/usr/bin/python3
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urlparse from urllib.parse import urlparse
import requests, os, argparse, logging import requests, os, argparse, logging, json
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry from requests.packages.urllib3.util.retry import Retry

View File

@ -8,7 +8,7 @@ from requests.packages.urllib3.util.retry import Retry
class WPimport: class WPimport:
# Constructor # Constructor
def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog", revert=False): def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog"):
self._name = name self._name = name
self._basic = basic self._basic = basic
self._wordpress = wordpress self._wordpress = wordpress
@ -29,7 +29,6 @@ class WPimport:
self._no_update = no_update self._no_update = no_update
self._no_image = no_image self._no_image = no_image
self._tmp = tmp self._tmp = tmp
self._revert = revert
# Destructor # Destructor
def __del__(self): def __del__(self):
@ -42,55 +41,50 @@ class WPimport:
def fromUrl(self, first, second): def fromUrl(self, first, second):
try: try:
content_file = open("{0}/{1}.json".format(self._name, self._tmp)) with open("{0}/{1}.json".format(self._tmp, self._name)) as file:
webpage_content = json.loads(content_file) webpage_content = json.loads(file.read())
webpage = webpage_content[first][second] self._logger.debug("{0} : size of webpage : {1}".format(self._name, len(webpage_content)))
for i in range(0, len(webpage)): webpage = webpage_content[first][second]
try: for i in range(0, len(webpage)):
r = self._request.get(webpage[i]) try:
if r.status_code == 200: r = self._request.get(webpage[i])
self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i])) if r.status_code == 200:
soup = BeautifulSoup(r.content, self._parser) self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i]))
articlebody = soup.find_all("div", class_="articlebody") soup = BeautifulSoup(r.content, self._parser)
if len(articlebody) > 0: articlebody = soup.find_all("div", class_="articlebody")
self._addOrUpdatePost(soup) if len(articlebody) > 0:
self._addOrUpdatePost(soup)
else:
self._addOrUpdateFeaturedMedia(soup)
del webpage_content[first][second][i]
webpage_content = json.dumps(webpage_content)
open("{0}/{1}.json".format(self._tmp, self._name), "wt").write(webpage_content)
else: else:
self._addOrUpdateFeaturedMedia(soup) self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code))
del webpage_content[first][second][i] self._logger.debug("{0} : {1}".format(self._name, r.content))
open("{0}/{1}.json".format(self._tmp, self._name), "wt").write(webpage_content) except ConnectionError as err:
else: self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err))
self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code)) exit(1)
self._logger.debug("{0} : {1}".format(self._name, r.content)) except IOError as err:
except ConnectionError as err: self._logger.error("{0} : Connection error for IO url {1} : {2}".format(self._name, webpage[i], err))
self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err)) exit(1)
exit(1) except Exception as err:
except IOError as err: self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, webpage[i], err))
self._logger.error("{0} : Connection error for IO url {1} : {2}".format(self._name, webpage[i], err))
exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, webpage[i], err))
except Exception as ex: except Exception as ex:
self._logger.error("{0} : Read file json from tmp : {1}".format(self._name, ex)) self._logger.error("{0} : Read file json from tmp : {1}".format(self._name, ex))
def fromDirectory(self, directory="", number_thread=1, max_thread=1): def fromDirectory(self, directory="", number_thread=1, max_thread=1):
if self._revert: self._directory = directory
self._directory = directory directory = "{0}/archives".format(directory)
directory = "{0}/archives".format(directory) directories = self._getDirectories([], "{0}".format(directory))
directories = self._getDirectories([], "{0}".format(directory)) if len(directories) > 0:
if len(directories) > 0: files = self._getFiles(directories)
files = self._getFiles(directories) self.fromFile(files=files, number_thread=number_thread, max_thread=max_thread)
self.fromFile(files, number_thread, max_thread)
else:
self._logger.error("{0} : No files for {1}".format(self._name, directory))
else: else:
try: self._logger.error("{0} : No files for {1}".format(self._name, directory))
files = open("{0}/{1}.json".format(self._name, self._tmp))
self.fromFile(files, number_thread, max_thread)
except Exception as ex:
self._logger.error("{0} : Read file json from tmp : {1}".format(self._name, ex))