def fromUrl(self, first, second):
    """Import every page listed in this instance's temporary JSON file.

    The file ``<self._tmp>/<self._name>.json`` is parsed and the URL list
    found at ``[first][second]`` is fetched one URL at a time.  A page that
    contains a ``div.articlebody`` element is handed to
    ``_addOrUpdatePost``; any other page goes to
    ``_addOrUpdateFeaturedMedia``.  After each successful import the URL is
    removed from the list and the JSON file is rewritten, so the file always
    holds the URLs that are still pending.

    Args:
        first: Key (or index) selecting the first level of the JSON document.
        second: Key (or index) selecting the URL list inside that level.

    Returns:
        None.  Failures are logged; connection and I/O errors raised while
        processing a URL terminate the process via ``exit(1)``.
    """
    path = "{0}/{1}.json".format(self._tmp, self._name)
    try:
        # Read and close the file before the loop: it is rewritten after
        # every imported page and must not still be open for reading then.
        with open(path) as file:
            webpage_content = json.load(file)
        self._logger.debug("{0} : size of webpage : {1}".format(self._name, len(webpage_content)))
        pending = webpage_content[first][second]
        # Iterate over a snapshot: `pending` shrinks as pages are imported,
        # and deleting by index from the list being indexed skips entries
        # and eventually runs past its end.
        urls = list(pending)
    except Exception as ex:
        self._logger.error("{0} : Read file json from tmp : {1}".format(self._name, ex))
        return

    total = len(urls)
    for position, url in enumerate(urls, start=1):
        try:
            r = self._request.get(url)
            if r.status_code != 200:
                self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, url, r.status_code))
                self._logger.debug("{0} : {1}".format(self._name, r.content))
                continue

            self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, position, total, url))
            soup = BeautifulSoup(r.content, self._parser)
            if soup.find_all("div", class_="articlebody"):
                self._addOrUpdatePost(soup)
            else:
                self._addOrUpdateFeaturedMedia(soup)

            # Record progress.  The parsed document is kept as-is (never
            # rebound to its serialised string) so it can be updated again
            # on the next iteration.
            pending.remove(url)
            # Serialise before opening so a serialisation error cannot
            # leave a truncated file behind.
            serialized = json.dumps(webpage_content)
            with open(path, "wt") as file:
                file.write(serialized)
        # NOTE(review): this is whichever `ConnectionError` is in scope at
        # module level (the builtin unless the module imports another one)
        # -- confirm it is the intended exception type.
        except ConnectionError as err:
            self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, url, err))
            exit(1)
        except IOError as err:
            self._logger.error("{0} : Connection error for IO url {1} : {2}".format(self._name, url, err))
            exit(1)
        except Exception as err:
            # Best effort: log and continue with the next URL.
            self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, url, err))