diff --git a/.gitignore b/.gitignore index f82fde7..c986d66 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ backup*/ wp-navigation *.log __pycache__/ +wp-gallery \ No newline at end of file diff --git a/import_export_canalblog.py b/import_export_canalblog.py index 4948b94..78fdb0f 100644 --- a/import_export_canalblog.py +++ b/import_export_canalblog.py @@ -67,7 +67,7 @@ def download(name_thread, max_thread, url, logger, parser, directory, html, img, del exportWp -def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress, ssl_canalblog, create, update, image, revert, tmp): +def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress, ssl_canalblog, create, update, image, revert, tmp, author): canalblog = canalblog.split(",") wordpress = wordpress.split(",") name = "Thread-{0}".format(int(name_thread) + 1) @@ -88,7 +88,7 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas exportWp.getUrlPage(name_thread, max_thread) del exportWp for j in wordpress: - importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp) + importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp, author=author) for k in ["article", "page"]: for l in ["publications", "principal"]: importWp.fromUrl(l, k) @@ -110,7 +110,7 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas if not revert: exportWp.getUrlPage(name_thread, max_thread) del exportWp - importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp) + importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp, author=author) for k in ["article", "page"]: for l in ["publications", "principal"]: @@ -119,13 +119,13 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas del importWp -def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial, ssl_wordpress, create, update, image, revert): +def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial, ssl_wordpress, create, update, image, revert, author): name = "Thread-{0}".format(int(name_thread) + 1) directory = directory.split(",") wordpress = wordpress.split(",") if serial is False: for i in wordpress: - importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image) + importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, author=author) for j in directory: importWp.fromDirectory(j, name_thread, max_thread, revert) del importWp @@ -135,7 +135,7 @@ def importDirectory(name_thread, max_thread, directory, logger, parser, wordpres logger.error("{0} : Error : Number directory is different than wordpress".format(name)) exit(1) for i in range(0, len(wordpress)-1): - importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image) + importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, author=author) importWp.fromDirectory(directory[i], name_thread, max_thread, revert) del importWp @@ -170,6 +170,7 @@ if __name__ == '__main__': import_parser.add_argument("--no-create", help="No create post", dest="create", default="store_false", action="store_true") import_parser.add_argument("--no-update", help="No update post", dest="update", default="store_false", action="store_true") import_parser.add_argument("--no-image", help="No image add or update", dest="image", default="store_false", action="store_true") + import_parser.add_argument("--author-album", dest=author, help="Define author for page album", default="") @@ -258,13 +259,13 @@ if __name__ == '__main__': basic = HTTPBasicAuth(args.user, password) if args.command == "import": wordpress = args.wordpress.split(",") - importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser, ssl_wordpress=ssl_wordpress) + importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser, ssl_wordpress=ssl_wordpress, author=args.author) if len(args.file) > 0: for i in wordpress: importWp.setUrl(i) importWp.fromFile(files=args.file.split(",")) menuWp = WPMenu(name="Thread-1", basic=basic, wordpress=args.wordpress, logger=logger, parser=args.parser, ssl_canalblog=ssl_canalblog, ssl_wordpress=ssl_wordpress) - menuWp.fromFile("{0}/index.html".format(args.file.split(",")[0])) + menuWp.fromFile("{0}".format(args.file.split(",")[0])) if len(args.directory) > 0: try: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: @@ -272,7 +273,7 @@ if __name__ == '__main__': wait(wait_for, return_when=ALL_COMPLETED) errorRevert(logger, args.revert, args.tmp) wait_for = [ - ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, args.create, args.update, args.image, args.revert) + ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, args.create, args.update, args.image, args.revert, args.author) for i in range(0, int(args.parallel)) ] wait(wait_for, return_when=ALL_COMPLETED) @@ -287,7 +288,7 @@ if __name__ == '__main__': wait(wait_for, return_when=ALL_COMPLETED) errorRevert(logger, args.revert, args.tmp) wait_for = [ - ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, ssl_canalblog, args.create, args.update, args.image, args.revert, args.tmp) + ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, ssl_canalblog, args.create, args.update, args.image, args.revert, args.tmp, args.author) for i in range(0, int(args.parallel)) ] wait(wait_for, return_when=ALL_COMPLETED) diff --git a/lib/WPImport.py b/lib/WPImport.py index 9f0767a..e28cf1a 100644 --- a/lib/WPImport.py +++ b/lib/WPImport.py @@ -8,7 +8,7 @@ from requests.packages.urllib3.util.retry import Retry class WPimport: # Constructor - def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog"): + def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog", author=""): self._name = name self._basic = basic self._wordpress = wordpress @@ -29,6 +29,7 @@ class WPimport: self._no_update = no_update self._no_image = no_image self._tmp = tmp + self._author = author # Destructor def __del__(self): @@ -55,7 +56,11 @@ class WPimport: if len(articlebody) > 0: self._addOrUpdatePost(soup) else: - self._addOrUpdateFeaturedMedia(soup) + albumbody = soup.find_all("div", class_="albumbody") + if len(albumbody) > 0: + self._addOrUpdateAlbum(soup) + else: + self._addOrUpdateFeaturedMedia(soup) del webpage_content[first][second][i] webpage_content = json.dumps(webpage_content) open("{0}/{1}.json".format(self._tmp, self._name), "wt").write(webpage_content) @@ -100,12 +105,192 @@ class WPimport: if len(articlebody) > 0: self._addOrUpdatePost(soup) else: - self._addOrUpdateFeaturedMedia(soup) + albumbody = soup.find_all("div", class_="albumbody") + if len(albumbody) > 0: + self._addOrUpdateAlbum(soup) + else: + self._addOrUpdateFeaturedMedia(soup) # Private method + + def _getAuthor(self, author): + params = {"search":author, "per_page":100} + author = 0 + try: + self._logger.info("{0} : Search author : {1}".format(self._name, author)) + page = self._request.get("{1}://{0}/wp-json/wp/v2/users".format(self._wordpress, self._protocol), auth=self._basic, headers=self._headers_json, params=params) + self._logger.debug("{0} : End Search author : {1}".format(self._name, author)) + self._logger.debug("{0} : Debug requests : {1}".format(self._name, page.content)) + if page.status_code == 200: + self._logger.info("{0} : Get author : {1}".format(self._name, author)) + result = page.json() + for a in result: + author = a["id"] + else: + self._logger.error("{0} : Connection error with status code for get author : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(page.content)) + except ConnectionError as err: + self._logger.error("{0} : Connection error for get author : {1}".format(self._name, err)) + exit(1) + except Exception as err: + self._logger.error("{0} : Exception error for get author : {1}".format(self._name, err)) + return author + + def _addOrUpdateAlbum(self, soup): + self._logger.info("{0} : Add/Update Album".format(self._name)) + albumbody = soup.find("div", class_="albumbody") + albumtitle = albumbody.find("h2").get_text() + self._logger.debug("{0} : Title of the album : {1}".format(self._name, albumtitle)) + albumdesc = albumbody.find("div", class_="albumdesc").find("p") + img_a = albumbody.find_all("img") + list_img = [] + page_is_exist = False + + if self._no_image is False: + self._logger.debug("{0} : Number of image's tag : {1}".format(self._name, len(img_a))) + + for i in img_a: + new_img = {} + href_img = i.get("src") + href_img_o = urlparse(href_img) + try: + if len(href_img_o.netloc) > 0: + img_ok = False + href_img = href_img.replace("_q", "_o") + page_img = self._request.get(href_img) + if page_img.status_code == 200: + img_ok = True + else: + if os.path.exists("{0}/..{1}".format(self._directory, href_img)): + page_img = open("{0}/..{1}".format(self._directory, href_img), "r") + img_ok = True + self._logger.debug("{0} : Status code for image {1} : {2}".format(self._name, href_img, page_img.status_code)) + if img_ok is True: + media=self._addOrUpdateMedia(href_img, page_img) + new_img["id"]=media["id"] + new_img["new_src"]=media["rendered"] + list_img.append(new_img) + except ConnectionError as err: + self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err)) + exit(1) + except Exception as err: + self._logger.error("{0} : Exception error for get image : {1}".format(self._name, err)) + exit(1) + self._logger.debug("{0} content img : {1}".format(self._name, list_img)) + content_html = "" + if len(list_img) > 0: + + content_html = "\n{0}\n\n\n".format(albumdesc) + + + + for i in range(0, len(list_img)): + content_html = content_html + "
\"\"
\n\n".replace("id-image", str(list_img[i]["id"])).replace("src-image", list_img[i]["new_src"]) + + self._logger.info("{0} : content html : {1}".format(self._name, content_html)) + if len(content_html) > 0: + data = {"title":albumtitle, "content":content_html, "status":"publish"} + if len(self._author) > 0: + author = self._getAuthor(self._author) + if author != 0: + data = {"title":albumtitle, "content":content_html, "status":"publish", "author":author} + self._logger.debug("{0} : data for album page : {1}".format(self._name, data)) + for index in range(1,10): + params = {"search": albumtitle, "per_page":100, "page": index} + try: + self._logger.info("{0} : Search post with index {2} : {1}".format(self._name, albumtitle, index)) + page = self._request.get("{1}://{0}/wp-json/wp/v2/pages".format(self._wordpress, self._protocol), auth=self._basic, params=params, headers=self._headers_json) + if page.status_code == 200: + self._logger.debug("{0} : Encoding : {1}".format(self._name, page.encoding)) + page.encoding = "utf-8" + result = page.json() + if len(result) == 0: + break + self._logger.info("{0} : Number result posts : {1}".format(self._name, len(result))) + count = 0 + for i in result: + title_rendered = i["title"]["rendered"] + self._logger.info("{0} : Search title pages for |{2}| : |{1}|".format(self._name, title_rendered, albumtitle)) + if len(title_rendered) != len(albumtitle): + title_rendered = self._replaceCaracter(title_rendered) + self._logger.debug("{0} : Search title pages for |{2}| : |{1}|".format(self._name, title_rendered, albumtitle)) + self._logger.debug("{0} : SIze of title : {1} - {2}".format(self._name, len(albumtitle), len(title_rendered))) + if title_rendered == albumtitle: + if self._no_update is False: + page_is_exist = True + post_id = i["id"] + count = count + 1 + if count > 1: + self._logger.info("{0} : Page {1} is double and going to delete".format(self._name, albumtitle)) + try: + params = {"force":1} + page = self._request.delete("{2}://{0}/wp-json/wp/v2/pages/{1}".format(self._wordpress, post_id, self._protocol), auth=self._basic, headers=self._headers_json, params=params) + if page.status_code == 200: + self._logger.info("{0} : Page deleted : {1}".format(self._name, albumtitle)) + else: + self._logger.error("{0} : Page not updated due status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) + except ConnectionError as err: + self._logger.error("{0} : Connection error for deleted page : {1}".format(self._name, err)) + exit(1) + except Exception as err: + self._logger.error("{0} : Exception error for deleted page : {1}".format(self._name, err)) + + else: + self._logger.debug("{0} : Data for page to update : {1}".format(self._name, i)) + self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, albumtitle)) + + try: + page = self._request.post("{2}://{0}/wp-json/wp/v2/pages/{1}".format(self._wordpress, post_id, self._protocol), auth=self._basic, headers=self._headers_json, data=json.dumps(data)) + + if page.status_code == 200: + result = page.json() + self._logger.info("{0} : page updated : {1}".format(self._name, albumtitle)) + else: + self._logger.error("{0} : page not updated due status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) + except ConnectionError as err: + self._logger.error("{0} : Connection error for update page : {1}".format(self._name, err)) + exit(1) + except Exception as err: + self._logger.error("{0} : Exception error for update page : {1}".format(self._name, err)) + if page.status_code == 400: + self._logger.error("{0} : Connection for update post unauthorized : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) + break + else: + self._logger.error("{0} : Connection for update page error with status code : {1}".format(self._name, page.status_code)) + self._logger.debug("{0} : {1}".format(self._name, page.content)) + except ConnectionError as err: + self._logger.error("{0} : Connection error for search page : {1}".format(self._name, err)) + exit(1) + except Exception as err: + self._logger.error("{0} : Exception error for search page : {1}".format(self._name, err)) + + if page_is_exist is False and self._no_create is False: + try: + self._logger.info("{0} : Creating page : {1}".format(self._name, data["title"])) + page = self._request.post("{1}://{0}/wp-json/wp/v2/pages".format(self._wordpress, self._protocol), auth=self._basic, headers=self._headers_json, data=json.dumps(data)) + + if page.status_code == 201: + result = page.json() + self._logger.info("{0} : page added : {1}".format(self._name, result["title"]["raw"])) + + else: + self._logger.error("{0} : page not added due status code : {1}".format(self._name, r.status_code)) + self._logger.debug("{0} : {1}".format(self._name, r.content)) + except ConnectionError as err: + self._logger.error("{0} : Connection error for create page : {1}".format(self._name, err)) + exit(1) + except Exception as err: + self._logger.error("{0} : Exception error for create page : {1}".format(self._name, err)) + + + + def _fromFileTmp(self): try: with open("{0}/{1}.json".format(self._tmp, self._name)) as file: @@ -643,25 +828,7 @@ class WPimport: data = {"title":title, "content":bodyhtml, "status":"publish", "date": "{0}-{1}-{2}T{3}:00".format(time[2],month[time[1]],time[0], hour), "tags": listelement["tags"], "categories": listelement["categories"]} self._logger.debug("{0} : Data for post : |{1}| : {2}" .format(self._name, title, data)) - params = {"search":author, "per_page":100} - try: - self._logger.info("{0} : Search author : {1}".format(self._name, author)) - page = self._request.get("{1}://{0}/wp-json/wp/v2/users".format(self._wordpress, self._protocol), auth=self._basic, headers=self._headers_json, params=params) - self._logger.debug("{0} : End Search author : {1}".format(self._name, author)) - self._logger.debug("{0} : Debug requests : {1}".format(self._name, page.content)) - if page.status_code == 200: - self._logger.info("{0} : Get author id : {1}".format(self._name, result)) - result = page.json() - for a in result: - data["author"] = a["id"] - else: - self._logger.error("{0} : Connection error with status code for get author : {1}".format(self._name, page.status_code)) - self._logger.debug("{0} : {1}".format(page.content)) - except ConnectionError as err: - self._logger.error("{0} : Connection error for get author : {1}".format(self._name, err)) - exit(1) - except Exception as err: - self._logger.error("{0} : Exception error for get author : {1}".format(self._name, err)) + data["author"] = self._getAuthor(author) page_is_exist = False for index in range(1,10): diff --git a/lib/WPMenu.py b/lib/WPMenu.py index bcb34a2..bc7c8ea 100644 --- a/lib/WPMenu.py +++ b/lib/WPMenu.py @@ -85,19 +85,28 @@ class WPMenu: def _getIdfromTitlePost(self, content): idMenu = {"id":0, "type":"", "link":""} soup = BeautifulSoup(content, self._parser) - articletitle = soup.find("h2", class_="articletitle").get_text() + articletitle = soup.find_all("h2", class_="articletitle") if len(articletitle) > 0: - articletitle = soup.find("h2").get_text() + articletitle = articletitle[0].get_text() + search = "posts" + post_type = "post" + if len(articletitle) == 0: + articletitle = soup.find_all("div", class_="albumbody") + if len(articletitle) > 0: + articletitle = articletitle[0].find("h2").get_text() + search = "pages" + post_type = "page" + exist = False for index in range(1,10): if exist is False: params = {"search":articletitle, "per_page":100, "page":index} try: - self._logger.debug("{0} : Get Url for post : {1} {2}".format(self._name, "{1}://{0}/wp-json/wp/v2/posts".format(self._wordpress, self._protocol_wordpress), params)) - page = self._request_wordpress.get("{1}://{0}/wp-json/wp/v2/posts".format(self._wordpress, self._protocol_wordpress), auth=self._basic, params=params) + self._logger.debug("{0} : Get Url for {3} : {1} {2}".format(self._name, "{1}://{0}/wp-json/wp/v2/{2}".format(self._wordpress, self._protocol_wordpress, search), params, search)) + page = self._request_wordpress.get("{1}://{0}/wp-json/wp/v2/{2}".format(self._wordpress, self._protocol_wordpress, search), auth=self._basic, params=params) if page.status_code == 200: result = page.json() - self._logger.info("{0} : Get content post : {1}".format(self._name, len(result))) + self._logger.info("{0} : Get content {2} : {1}".format(self._name, len(result), search)) if len(result) > 0: for i in result: title_rendered = i["title"]["rendered"] @@ -105,8 +114,8 @@ class WPMenu: title_rendered = self._replaceCaracter(title_rendered) self._logger.debug("{0} : comparaison debug {1} {2}".format(self._name, articletitle, title_rendered)) if articletitle == title_rendered: - self._logger.debug("{0} : get post id : {1}".format(self._name, i)) - idMenu = {"id":i["id"], "type":"post", "link": i["link"]} + self._logger.debug("{0} : get {2} id : {1}".format(self._name, i, search)) + idMenu = {"id":i["id"], "type":post_type, "link": i["link"]} exist = True else: self._logger.debug("{0} : {2} {1}".format(self._name, result, len(result))) @@ -156,13 +165,18 @@ class WPMenu: idMenu = {"id":0, "type":"", "link":""} if href != "#": title = href[::-1] + second_title = title.split("/")[2] + second_title = second_title[::-1] link = title.split("/")[0] link = link[::-1] title = title.split("/")[1] title = title[::-1] self._logger.info("{0} link {1} title {2}".format(self._name, link, title)) if link == "index.html": - idMenu = self._getId(title) + if second_title == "albums": + idMenu = self._getIdFromPost(href) + else: + idMenu = self._getId(title) else: idMenu = self._getIdFromPost(href)