import threading for directory WIP

This commit is contained in:
Valentin CZERYBA 2023-04-27 00:00:53 +02:00
parent a0b816fe18
commit bf4c2480f8
2 changed files with 65 additions and 36 deletions

View File

@ -10,7 +10,6 @@ from lib.WPExport import WPExport
def download(name_thread, max_thread, url, logger, parser, directory, html, img): def download(name_thread, max_thread, url, logger, parser, directory, html, img):
#def download(args):
exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory) exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory)
@ -21,6 +20,46 @@ def download(name_thread, max_thread, url, logger, parser, directory, html, img)
if args.img is False: if args.img is False:
exportWp.downloadImg(webpage) exportWp.downloadImg(webpage)
def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial):
    """Export pages from canalblog URLs and import them into WordPress sites.

    Parameters:
        name_thread: zero-based index of the calling worker; the log name is
            "Thread-<index + 1>". It is also forwarded to
            WPExport.getUrlPage together with max_thread.
        max_thread: total number of workers, forwarded to WPExport.getUrlPage.
        canalblog: comma-separated string of source URLs.
        logger: object exposing an error() method.
        parser: parser name handed to WPExport and WPimport.
        wordpress: comma-separated string of destination WordPress URLs.
        basic: credentials object handed to WPimport as-is.
        serial: when exactly False, every source is imported into every
            destination; otherwise sources and destinations are paired by
            position and must be equal in number.

    Raises:
        SystemExit: with code 1 when a URL cannot be parsed, or when the
            positional pairing is requested with lists of different lengths.
    """
    # NOTE(review): this function is submitted to a ThreadPoolExecutor; there,
    # exit(1) raises SystemExit inside the worker and the executor stores it
    # on the future instead of ending the process -- confirm this is intended.
    canalblog = canalblog.split(",")
    wordpress = wordpress.split(",")
    name = "Thread-{0}".format(int(name_thread) + 1)
    if serial is False:
        for canal in canalblog:
            try:
                o = urlparse(canal)
                o = o._replace(scheme="https")
                # A scheme-less input has an empty netloc, so geturl() yields
                # "https:///host/..."; collapse the extra slash.
                url = o.geturl().replace(":///", "://")
            except Exception as err:
                logger.error("{0} : parsing error : {1}".format(name, err))
                exit(1)
            exportWp = WPExport(name=name, url=url, logger=logger, parser=parser)
            webpage = exportWp.getUrlPage(name_thread, max_thread)
            for target in wordpress:
                importWp = WPimport(name=name, basic=basic, wordpress=target, logger=logger, parser=parser)
                importWp.fromUrl(webpage)
    else:
        if len(canalblog) != len(wordpress):
            logger.error("{0} : ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress".format(name))
            exit(1)
        # Pair every source with its destination. The previous
        # range(0, len(canalblog)-1) loop silently skipped the last pair
        # (and did nothing at all when only one pair was given).
        for canal, target in zip(canalblog, wordpress):
            try:
                o = urlparse(canal)
                o = o._replace(scheme="https")
                url = o.geturl().replace(":///", "://")
            except Exception as err:
                logger.error("parsing error : {0}".format(err))
                exit(1)
            exportWp = WPExport(name=name, url=url, logger=logger, parser=parser)
            webpage = exportWp.getUrlPage(name_thread, max_thread)
            importWp = WPimport(name=name, basic=basic, wordpress=target, logger=logger, parser=parser)
            importWp.fromUrl(webpage)
def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic):
name = "Thread-{0}".format(int(name_thread) + 1)
importWp = WPimport(name=name, basic=basic, wordpress=wordpress, logger=logger, parser=parser)
@ -112,40 +151,14 @@ if __name__ == '__main__':
importWp.fromDirectory(directory[i]) importWp.fromDirectory(directory[i])
exit(0) exit(0)
if len(args.canalblog) > 0: if len(args.canalblog) > 0:
exportWp = WPExport("", logger, args.parser, args.directory)
canalblog = args.canalblog.split(",")
wordpress = args.wordpress.split(",")
if args.serial is False:
for canal in canalblog:
try: try:
o = urlparse(canal) with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
o = o._replace(scheme="https") wait_for = [
url = o.geturl().replace(":///", "://") ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial)
for i in range(0, int(args.parallel))
]
except Exception as err: except Exception as err:
logger.error("parsing error : {0}".format(err)) logger.error("Threading error : {0}".format(err))
exit(1)
exportWp.setUrl(url)
webpage = exportWp.getUrlPage()
for j in wordpress:
importWp.setUrl(j)
importWp.fromUrl(webpage)
else:
if len(canalblog) != len(wordpress):
logger.error("ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress")
exit(1)
for i in range(0, len(canalblog)-1):
try:
o = urlparse(canalblog[i])
o = o._replace(scheme="https")
url = o.geturl().replace(":///", "://")
except Exception as err:
logger.error("parsing error : {0}".format(err))
exit(1)
exportWp.setUrl(url)
webpage = exportWp.getUrlPage()
importWp.setUrl(wordpress[i])
importWp.fromUrl(webpage)

View File

@ -8,7 +8,8 @@ from requests.packages.urllib3.util.retry import Retry
class WPimport: class WPimport:
# Constructor # Constructor
def __init__(self, basic, wordpress, logger, parser): def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser"):
self._name = name
self._basic = basic self._basic = basic
self._wordpress = wordpress self._wordpress = wordpress
self._logger = logger self._logger = logger
@ -69,6 +70,21 @@ class WPimport:
# Private method # Private method
## Import posts or featured media from a list of files (one slice per thread)
def fromFile(self, files):
    """Parse each existing file in ``files`` and import its content.

    Every path that exists on disk is logged, read and parsed with
    BeautifulSoup using ``self._parser``. When the document contains at
    least one ``<div class="articlebody">`` it is passed to
    ``self._addOrUpdatePost``; otherwise it is passed to
    ``self._addOrUpdateFeaturedMedia``. Paths that do not exist are
    skipped without logging.

    Parameters:
        files: iterable of file paths to process.
    """
    # Iterate the paths directly: the earlier index loop tested and opened
    # the undefined name `file` instead of files[i], raising NameError.
    for path in files:
        if not os.path.exists(path):
            continue
        self._logger.info("Fichier en cours de traitement : {0}".format(path))
        with open(path, 'r') as f:
            content = f.read()
        soup = BeautifulSoup(content, self._parser)
        articlebody = soup.find_all("div", class_="articlebody")
        if len(articlebody) > 0:
            self._addOrUpdatePost(soup)
        else:
            self._addOrUpdateFeaturedMedia(soup)
## Get all files ## Get all files
def _getFiles(self, item): def _getFiles(self, item):