#!/usr/bin/python3
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests, os, logging, re, json
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


class WPimport:
    # Constructor
    def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog"):
        self._name = name
        self._basic = basic
        self._wordpress = wordpress
        self._logger = logger
        self._parser = parser
        self._headers_json = {'Content-Type': 'application/json; charset=utf-8', 'Accept': 'application/json'}
        self._protocol = "https"
        self._directory = "backup"
        if ssl_wordpress is False:
            self._protocol = "http"
        # Retry transient failures (connection errors, 429 and 5xx responses)
        # with exponential backoff instead of failing on the first error.
        self._request = requests.Session()
        retries = Retry(connect=10, read=10, redirect=5, status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)
        self._request.mount('{0}://'.format(self._protocol), HTTPAdapter(max_retries=retries))
        self._no_create = no_create
        self._no_update = no_update
        self._no_image = no_image
        self._tmp = tmp

    # Destructor
    def __del__(self):
        print("{0} : Import finished for {1}".format(self._name, self._wordpress))

    # Public method

    def setUrl(self, wordpress):
        self._wordpress = wordpress

    def fromUrl(self, first, second):
        try:
            with open("{0}/{1}.json".format(self._tmp, self._name)) as file:
                webpage_content = json.loads(file.read())
            self._logger.debug("{0} : size of webpage : {1}".format(self._name, len(webpage_content)))
            # Iterate over a snapshot so removing imported pages from
            # webpage_content does not shift the indices being iterated.
            webpage = list(webpage_content[first][second])
            for i in range(0, len(webpage)):
                try:
                    r = self._request.get(webpage[i])
                    if r.status_code == 200:
                        self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i]))
                        soup = BeautifulSoup(r.content, self._parser)
                        articlebody = soup.find_all("div", class_="articlebody")
                        if len(articlebody) > 0:
                            self._addOrUpdatePost(soup)
                        else:
                            albumbody = soup.find_all("div", class_="albumbody")
                            if len(albumbody) > 0:
                                self._addOrUpdateAlbum(soup)
                            else:
                                self._addOrUpdateFeaturedMedia(soup)
                        # Persist progress: drop the imported page from the tmp
                        # file so an interrupted run can resume where it stopped.
                        webpage_content[first][second].remove(webpage[i])
                        with open("{0}/{1}.json".format(self._tmp, self._name), "wt") as tmp_file:
                            tmp_file.write(json.dumps(webpage_content))
                    else:
                        self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code))
                        self._logger.debug("{0} : {1}".format(self._name, r.content))
                except requests.exceptions.ConnectionError as err:
                    self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err))
                    exit(1)
                except IOError as err:
                    self._logger.error("{0} : IO error for url {1} : {2}".format(self._name, webpage[i], err))
                    exit(1)
                except Exception as err:
                    self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, webpage[i], err))
        except Exception as ex:
            self._logger.error("{0} : Read file json from tmp : {1}".format(self._name, ex))

    def fromDirectory(self, directory="", number_thread=1, max_thread=1, revert=False):
        self._directory = directory
        directory = "{0}/archives".format(directory)
        directories = self._getDirectories([], "{0}".format(directory))
        if len(directories) > 0:
            files = self._getFiles(directories)
            if revert is False:
                self._tmpFiles(files=files, number_thread=number_thread, max_thread=max_thread)
            self._fromFileTmp()
        else:
            self._logger.error("{0} : No files for {1}".format(self._name, directory))
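    # A minimal usage sketch (hypothetical values, assuming an earlier export
    # step has written the site's HTML pages under "<directory>/archives"):
    #
    #   wp = WPimport(wordpress="blog.example.org", logger=logging.getLogger(__name__))
    #   wp.fromDirectory(directory="backup")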
    def fromFile(self, files=[]):
        for i in range(0, len(files)):
            if os.path.exists(files[i]):
                self._logger.info("{0} : ({1}/{2}) File is being processed : {3}".format(self._name, i+1, len(files), files[i]))
                with open(files[i], 'r') as f:
                    content = f.read()
                    self._logger.debug("{0} : Size of article : {1}".format(self._name, len(content)))
                    soup = BeautifulSoup(content, self._parser)
                    articlebody = soup.find_all("div", class_="articlebody")
                    self._logger.debug("{0} : Number of articles : {1}".format(self._name, len(articlebody)))
                    if len(articlebody) > 0:
                        self._addOrUpdatePost(soup)
                    else:
                        albumbody = soup.find_all("div", class_="albumbody")
                        if len(albumbody) > 0:
                            self._addOrUpdateAlbum(soup)
                        else:
                            self._addOrUpdateFeaturedMedia(soup)

    # Private method

    def _addOrUpdateAlbum(self, soup):
        self._logger.info("{0} : Add/Update Album".format(self._name))
        albumbody = soup.find("div", class_="albumbody")
        albumtitle = albumbody.find("h2").get_text()
        self._logger.debug("{0} : Title of the album : {1}".format(self._name, albumtitle))
        albumdesc = albumbody.find("div", class_="albumdesc").find("p")
        img_a = albumbody.find_all("img")
        list_img = []
        if self._no_image is False:
            self._logger.debug("{0} : Number of image tags : {1}".format(self._name, len(img_a)))
            for i in img_a:
                new_img = {}
                href_img = i.get("src")
                href_img_o = urlparse(href_img)
                new_img["old_src"] = href_img
                try:
                    img_ok = False
                    if len(href_img_o.netloc) > 0:
                        # Absolute URL: fetch the image over HTTP.
                        page_img = self._request.get(href_img)
                        self._logger.debug("{0} : Status code for image {1} : {2}".format(self._name, href_img, page_img.status_code))
                        if page_img.status_code == 200:
                            img_ok = True
                    else:
                        # Relative URL: read the image from the local backup,
                        # in binary mode since it is not text.
                        if os.path.exists("{0}/..{1}".format(self._directory, href_img)):
                            page_img = open("{0}/..{1}".format(self._directory, href_img), "rb")
                            img_ok = True
                    if img_ok is True:
                        media = self._addOrUpdateMedia(href_img, page_img)
                        new_img["id"] = media["id"]
                        new_img["new_src"] = media["rendered"]
                        list_img.append(new_img)
                except requests.exceptions.ConnectionError as err:
                    self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err))
                    exit(1)
                except Exception as err:
                    self._logger.error("{0} : Exception error for get image : {1}".format(self._name, err))
                    exit(1)
        self._logger.debug("{0} : content img : {1}".format(self._name, list_img))
        content_html = ""
        if len(list_img) > 0:
            # Album description first (albumdesc is the <p> tag extracted
            # above, rendered with its markup), followed by a blank line.
            content_html = "{0}\n\n".format(albumdesc)
            content_html = content_html + "