Review notes (not part of the patch; patch tools skip text before the first "diff --git" line):
- Line breaks and indentation were restored from a whitespace-collapsed copy of this patch.
  Context-line indentation, blank context lines and the "<p>" literal in the last hunk are
  inferred - confirm against the repository before applying.
- The code added to _getInfoAlbum and _addOrUpdateAlbum was revised: https upgrade limited to the
  scheme, no unbound page_img / href_a, local path built from the URL path, file handle closed,
  missing markup handled by returning "" instead of raising.
- Hunk headers of lib/WPImport.py were recomputed for the revised line counts.

diff --git a/import_export_canalblog.py b/import_export_canalblog.py
index 78fdb0f..ce2068e 100644
--- a/import_export_canalblog.py
+++ b/import_export_canalblog.py
@@ -170,7 +170,7 @@ if __name__ == '__main__':
     import_parser.add_argument("--no-create", help="No create post", dest="create", default="store_false", action="store_true")
     import_parser.add_argument("--no-update", help="No update post", dest="update", default="store_false", action="store_true")
     import_parser.add_argument("--no-image", help="No image add or update", dest="image", default="store_false", action="store_true")
-    import_parser.add_argument("--author-album", dest=author, help="Define author for page album", default="")
+    import_parser.add_argument("--author", dest="author", help="Define author", default="")
 
 
 
@@ -259,7 +259,7 @@ if __name__ == '__main__':
     basic = HTTPBasicAuth(args.user, password)
     if args.command == "import":
         wordpress = args.wordpress.split(",")
-        importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser, ssl_wordpress=ssl_wordpress, author=args.author)
+        importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser, ssl_wordpress=ssl_wordpress, author=args.author, ssl_canalblog=ssl_canalblog)
         if len(args.file) > 0:
             for i in wordpress:
                 importWp.setUrl(i)
diff --git a/lib/WPImport.py b/lib/WPImport.py
index e28cf1a..d0240a9 100644
--- a/lib/WPImport.py
+++ b/lib/WPImport.py
@@ -8,7 +8,7 @@ from requests.packages.urllib3.util.retry import Retry
 
 class WPimport:
     # Constructor
-    def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog", author=""):
+    def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog", author="", ssl_canalblog=True):
         self._name = name
         self._basic = basic
         self._wordpress = wordpress
@@ -20,7 +20,7 @@ class WPimport:
         if ssl_wordpress is False:
             self._protocol = "http"
         self._request = requests.Session()
-
+        self._ssl_canalblog = ssl_canalblog
         retries = Retry(connect=10, read=10, redirect=5, status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)
 
 
@@ -139,6 +139,63 @@ class WPimport:
             self._logger.error("{0} : Exception error for get author : {1}".format(self._name, err))
         return author
 
+    def _getInfoAlbum(self, link):
+        """Return the album author read from a Canalblog album page.
+
+        A link with a host is fetched through the shared session; any other
+        link is read from disk, relative to the parent of self._directory.
+        Returns the text after ":" in the second segment of the album
+        paragraph, lower-cased without spaces, or "" when it cannot be found.
+        """
+        # Upgrade the scheme only: replace("http", "https") also rewrote
+        # "https://" links into "httpss://".
+        if self._ssl_canalblog and link.startswith("http://"):
+            link = "https://" + link[len("http://"):]
+        self._logger.info("{0} : Info album : {1}".format(self._name, link))
+        link_o = urlparse(link)
+        page_img = None
+        if len(link_o.netloc) > 0:
+            self._logger.info("{0} : get album info from web : {1}".format(self._name, link_o))
+            try:
+                response = self._request.get(link)
+                if response.status_code == 200:
+                    self._logger.info("{0} : get content info from web : {1}".format(self._name, link))
+                    page_img = response.content
+            except ConnectionError as err:
+                self._logger.error("{0} : Connection error for get album info : {1}".format(self._name, err))
+                exit(1)
+            except Exception as err:
+                self._logger.error("{0} : Exception error for get album info : {1}".format(self._name, err))
+                exit(1)
+        else:
+            self._logger.info("{0} : get album info from file : {1}".format(self._name, link_o))
+            # Build the path from the URL path; formatting the ParseResult
+            # itself yields its repr, which never names an existing file.
+            album_path = "{0}/..{1}".format(self._directory, link_o.path)
+            if os.path.exists(album_path):
+                with open(album_path, "r") as album_file:
+                    page_img = album_file.read()
+        if page_img is None:
+            # Non-200 answer or missing file: there is nothing to parse.
+            self._logger.error("{0} : No content for album info : {1}".format(self._name, link))
+            return ""
+        soup = BeautifulSoup(page_img, self._parser)
+        albumbody = soup.find("div", class_="albumbody")
+        paragraph = albumbody.find("p") if albumbody is not None else None
+        self._logger.info("{0} get paragraph : {1}".format(self._name, paragraph))
+        if paragraph is None:
+            return ""
+        # NOTE(review): the two separator literals tried by the first version
+        # are not legible in this copy; assumed to be <br> variants - confirm.
+        split_paragraph = re.split(r"<br\s*/?>", str(paragraph))
+        self._logger.info("{0} length paragraph splitted : {1}".format(self._name, len(split_paragraph)))
+        self._logger.info("{0} get paragraph splitted : {1}".format(self._name, split_paragraph))
+        if len(split_paragraph) < 2 or ":" not in split_paragraph[1]:
+            return ""
+        author = split_paragraph[1].split(":")[1].replace(" ", "").lower()
+        return author
+
+
     def _addOrUpdateAlbum(self, soup):
         self._logger.info("{0} : Add/Update Album".format(self._name))
         albumbody = soup.find("div", class_="albumbody")
@@ -195,8 +252,24 @@ class WPimport:
         data = {"title":albumtitle, "content":content_html, "status":"publish"}
         if len(self._author) > 0:
             author = self._getAuthor(self._author)
-            if author != 0:
-                data = {"title":albumtitle, "content":content_html, "status":"publish", "author":author}
+        else:
+            # No forced author: read it from the album page linked in the body.
+            author = 0
+            href_a = None
+            for i in albumbody.find_all("a"):
+                if re.search(r"/albums/", i.get("href", "/")):
+                    href_a = i.get("href", "/")
+                    break
+            # href_a stays None when the body has no album link; skip the
+            # lookup instead of failing on an unbound name.
+            if href_a is not None:
+                author_name = self._getInfoAlbum(href_a)
+                self._logger.info("{0} : author : {1}".format(self._name, author_name))
+                if len(author_name) > 0:
+                    author = self._getAuthor(author_name)
+
+        if author != 0:
+            data = {"title":albumtitle, "content":content_html, "status":"publish", "author":author}
         self._logger.debug("{0} : data for album page : {1}".format(self._name, data))
         for index in range(1,10):
             params = {"search": albumtitle, "per_page":100, "page": index}
@@ -806,7 +879,9 @@ class WPimport:
                     self._logger.error("{0} : Exception error for post {1} : {2}".format(self._name, i, err))
 
             title = articletitle[0].text
-            author = articleacreator[0].text.lower()
+            author = articleacreator[0].text.lower()
+            if len(self._author) > 0:
+                author = self._author
             body = articlebody[0].find_all("p")
             bodyhtml = "<p>"
             for i in body: