webpage-file #20

Merged
v4l3n71n merged 8 commits from webpage-file into master 2023-06-26 22:28:28 +00:00
2 changed files with 29 additions and 29 deletions
Showing only changes of commit a67ff868f3 - Show all commits

View File

@@ -1,7 +1,7 @@
#!/usr/bin/python3 #!/usr/bin/python3
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urlparse from urllib.parse import urlparse
import requests, os, argparse, logging import requests, os, argparse, logging, json
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry from requests.packages.urllib3.util.retry import Retry

View File

@@ -41,35 +41,35 @@ class WPimport:
def fromUrl(self, first, second): def fromUrl(self, first, second):
try: try:
content_file = open("{0}/{1}.json".format(self._name, self._tmp)) with open("{0}/{1}.json".format(self._tmp, self._name)) as file:
webpage_content = json.loads(content_file) webpage_content = json.loads(file.read())
webpage = webpage_content[first][second] self._logger.debug("{0} : size of webpage : {1}".format(self._name, len(webpage_content)))
for i in range(0, len(webpage)): webpage = webpage_content[first][second]
try: for i in range(0, len(webpage)):
r = self._request.get(webpage[i]) try:
if r.status_code == 200: r = self._request.get(webpage[i])
self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i])) if r.status_code == 200:
soup = BeautifulSoup(r.content, self._parser) self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i]))
articlebody = soup.find_all("div", class_="articlebody") soup = BeautifulSoup(r.content, self._parser)
if len(articlebody) > 0: articlebody = soup.find_all("div", class_="articlebody")
self._addOrUpdatePost(soup) if len(articlebody) > 0:
self._addOrUpdatePost(soup)
else:
self._addOrUpdateFeaturedMedia(soup)
del webpage_content[first][second][i]
webpage_content = json.dumps(webpage_content)
open("{0}/{1}.json".format(self._tmp, self._name), "wt").write(webpage_content)
else: else:
self._addOrUpdateFeaturedMedia(soup) self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code))
del webpage_content[first][second][i] self._logger.debug("{0} : {1}".format(self._name, r.content))
open("{0}/{1}.json".format(self._tmp, self._name), "wt").write(webpage_content) except ConnectionError as err:
else: self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err))
self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code)) exit(1)
self._logger.debug("{0} : {1}".format(self._name, r.content)) except IOError as err:
except ConnectionError as err: self._logger.error("{0} : Connection error for IO url {1} : {2}".format(self._name, webpage[i], err))
self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err)) exit(1)
exit(1) except Exception as err:
except IOError as err: self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, webpage[i], err))
self._logger.error("{0} : Connection error for IO url {1} : {2}".format(self._name, webpage[i], err))
exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, webpage[i], err))
except Exception as ex: except Exception as ex:
self._logger.error("{0} : Read file json from tmp : {1}".format(self._name, ex)) self._logger.error("{0} : Read file json from tmp : {1}".format(self._name, ex))