# lib/WPChange.py -- recovered from the patch "add wpchange" (commit 3e76892).
from urllib.parse import urlparse
import json
import logging
import os
import re


class WPRemove:
    """Process one worker's share of the files found under an archive tree.

    Each worker is given the full file list together with its own index and
    the total number of workers; it handles only its slice of that list.
    Every file is parsed with BeautifulSoup and dispatched depending on
    whether it contains a ``<div class="articlebody">`` element.
    """

    # Constructor
    def __init__(self, index_name=1, number_thread=1, logger=None,
                 parser="html.parser"):
        """Create a worker.

        Args:
            index_name: Value used to build the worker name ``Thread-<index>``.
            number_thread: Stored on the instance as ``_number_thread``.
            logger: Logger used for every message; when ``None`` a module
                logger is used so logging calls never fail.
            parser: Parser name handed to BeautifulSoup.
        """
        self._name = "Thread-{0}".format(index_name)
        self._logger = logger if logger is not None else logging.getLogger(__name__)
        self._number_thread = number_thread
        # fromFile reads this attribute; it was never initialised before.
        self._parser = parser

    # Destructor
    def __del__(self):
        print("{0} : Import finished".format(self._name))

    # Public method

    ## From file

    def fromFile(self, files=None, number_thread=1, max_thread=1):
        """Process the slice of ``files`` that belongs to one worker.

        Args:
            files: Paths to process; ``None`` is treated as an empty list.
            number_thread: Zero-based index of this worker.
            max_thread: Total number of workers sharing ``files``.

        Paths that do not exist are skipped.
        """
        files = [] if files is None else files
        firstRange, lastRange = self._threadRange(len(files), number_thread, max_thread)
        self._logger.debug("{0} : index : {1}".format(self._name, number_thread))

        self._logger.debug("{0} : first range : {1}".format(self._name, firstRange))
        self._logger.debug("{0} : last range : {1}".format(self._name, lastRange))

        for i in range(firstRange, lastRange):
            path = files[i]
            if not os.path.exists(path):
                continue
            self._logger.info("{0} : ({1}/{2}) File is being processed : {3}".format(self._name, i+1, lastRange, path))
            self._processFile(path)

    ## From directory

    def fromDirectory(self, directory="", number_thread=1, max_thread=1):
        """Collect the files below ``<directory>/archives`` and process this worker's slice.

        Only files located in sub-directories of ``archives`` are collected.
        An error is logged when ``archives`` is missing or has no
        sub-directory.
        """
        directory = "{0}/archives".format(directory)
        directories = []
        # A missing archive folder is reported below instead of raising.
        if os.path.isdir(directory):
            directories = self._getDirectories([], directory)
        if len(directories) > 0:
            files = self._getFiles(directories)
            self.fromFile(files, number_thread, max_thread)
        else:
            self._logger.error("{0} : No files for {1}".format(self._name, directory))

    # Private method

    ## Compute the slice of files handled by one worker

    @staticmethod
    def _threadRange(total, number_thread, max_thread):
        """Return ``(first, last)`` bounds of the slice owned by one worker.

        Every worker gets ``total // max_thread`` items; the last worker
        (index ``max_thread - 1``) additionally takes the remainder so no
        item is left out. Both bounds are clamped to ``total``.
        """
        chunk = total // max_thread
        last = chunk * (number_thread + 1)
        first = last - chunk
        if number_thread == max_thread - 1:
            last = total
        return min(first, total), min(last, total)

    ## Parse one file and dispatch it

    def _processFile(self, path):
        """Read ``path``, parse it and call the matching handler."""
        # Imported here so the listing/partition helpers stay usable when
        # BeautifulSoup is not installed.
        from bs4 import BeautifulSoup

        with open(path, 'r') as f:
            content = f.read()
        self._logger.debug("{0} : Size of article : {1}".format(self._name, len(content)))
        soup = BeautifulSoup(content, self._parser)
        articlebody = soup.find_all("div", class_="articlebody")
        self._logger.debug("{0} : Number of article : {1}".format(self._name, len(articlebody)))
        # NOTE(review): neither handler below is defined in this class;
        # they have to be provided before a file can be processed -- confirm
        # the intended implementation.
        if len(articlebody) > 0:
            self._addOrUpdatePost(soup)
        else:
            self._addOrUpdateFeaturedMedia(soup)

    ## Get all files

    def _getFiles(self, item):
        """Return the regular files directly inside each directory of ``item``.

        Entries are listed in sorted order so every worker sees the same
        sequence and the index-based slices do not overlap.
        """
        files = []
        for directory in item:
            for name in sorted(os.listdir(directory)):
                path = "{0}/{1}".format(directory, name)
                if os.path.isfile(path):
                    files.append(path)
        return files

    ## Get directories

    def _getDirectories(self, subdirectory, item):
        """Append every directory found recursively below ``item``.

        ``subdirectory`` is extended in place (parents before their
        children, siblings in sorted order) and returned.
        """
        for name in sorted(os.listdir(item)):
            path = "{0}/{1}".format(item, name)
            if os.path.isdir(path):
                subdirectory.append(path)
                self._getDirectories(subdirectory, path)
        return subdirectory