53 Commits

Author SHA1 Message Date
351cb10f01 Merge pull request 'fix-media' (#12) from fix-media into master
Reviewed-on: #12
2023-05-23 14:47:07 +00:00
5c5dc707f5 fix headers search author 2023-05-23 16:46:07 +02:00
f69298179a reduce line code and add private method 2023-05-23 13:45:59 +02:00
d3ec7d147d loop replace 2023-05-23 11:22:37 +02:00
0fc6e78a18 fix title rendered 2023-05-23 00:02:51 +02:00
3718b807ba more message debug 2023-05-21 21:14:36 +02:00
75772ba7f0 remove doublon 2023-05-21 21:12:00 +02:00
769b7f43fc fix add or update post 2023-05-18 00:24:41 +02:00
ba42d56be1 fix webpage 2023-05-16 00:15:16 +02:00
d18f4e1579 Add clean 2023-05-15 23:51:45 +02:00
8bdaea3910 add remove command 2023-05-15 23:42:18 +02:00
f3cb5c4069 fix parameters 2023-05-15 23:22:41 +02:00
cfb24bed0e add remove parameters 2023-05-15 23:21:25 +02:00
ee8674fd59 add remove class 2023-05-15 23:13:55 +02:00
ece4d78dd8 add remove all 2023-05-14 18:35:36 +02:00
3d7aa19441 add update 2023-05-12 00:16:58 +02:00
3c2f1cc017 separate publication and principal 2023-05-07 17:38:44 +02:00
f9be6770e3 separate article and page 2023-05-07 09:26:48 +02:00
21d2f35e6e add password parameter and fix post to delete 75% 2023-05-04 00:47:06 +02:00
4789fe80aa fix import 50% 2023-05-02 16:59:31 +02:00
3161a06459 Merge pull request 'thread' (#9) from thread into master
Reviewed-on: #9
2023-05-01 20:05:02 +00:00
1f6bd96a8e add del 2023-05-01 21:58:47 +02:00
b359521001 fix from directory import 2023-05-01 21:44:33 +02:00
73c0998ae0 fix thread fromDirectory and fromUrl 2023-05-01 21:18:57 +02:00
939e744d1d remove draft file 2023-05-01 15:45:59 +02:00
0029898e6e add debug message + fix error directory list 2023-05-01 15:45:34 +02:00
ab3720fbbc fix directory in thread 2023-04-29 22:26:47 +02:00
7a1286c4e2 add thread for directory import 2023-04-28 23:37:13 +02:00
5a4bdbb420 add name thread in message logger 2023-04-28 23:14:57 +02:00
bf4c2480f8 import threading for directory WIP 2023-04-27 00:00:53 +02:00
a0b816fe18 add debug thread 2023-04-26 23:03:43 +02:00
08ff16527d fix thread in parallelism 2023-04-25 16:15:45 +02:00
0acd5067cb thread 50% 2023-04-25 00:34:25 +02:00
aaac2385a3 fix previos commit 2023-04-24 23:16:53 +02:00
88f258ffba Add parallelism 2023-04-24 23:15:29 +02:00
a39e2200bd add function 2023-04-22 00:07:54 +02:00
5a5658d955 Merge pull request 'parent-comment' (#8) from parent-comment into master
Reviewed-on: #8
2023-04-20 19:30:45 +00:00
4e6ae92217 add message error and debug for export 2023-04-20 20:53:50 +02:00
34d6cc39d2 add debug message for error request 2023-04-20 20:48:37 +02:00
c44ffc5a86 double comment 2023-04-20 00:08:56 +02:00
ca39826a11 fix comment parent 75% 2023-04-19 23:53:11 +02:00
f8d103ff61 fix add comment 2023-04-19 23:16:39 +02:00
1c252c9a14 replace post by delete 2023-04-19 22:21:15 +02:00
84cc204007 comment update/add in fixing 2023-04-18 22:01:44 +02:00
edb9442b1c add search tags and categories before create tags and categories 2023-04-18 21:50:36 +02:00
d64aed6240 update error message + add debug 2023-04-18 00:00:32 +02:00
a5e7cb89f7 add error status code 2023-04-17 23:44:09 +02:00
ae7cb1e4e0 remove exit useless 2023-04-16 21:26:48 +02:00
4cf301b216 parent comment 90% 2023-04-16 21:25:32 +02:00
581b6941a6 parent id 75% 2023-04-16 21:06:04 +02:00
bd8ac241c1 debug level comment 2023-04-16 19:32:00 +02:00
0e15e88f31 Get level comment 50% 2023-04-16 19:16:23 +02:00
b54785c455 add parent comment WIP 2023-04-14 23:10:07 +02:00
5 changed files with 734 additions and 238 deletions

.gitignore vendored (6 changed lines)

@@ -1,4 +1,4 @@
-backup/
-backup1/
-web_scrap.log
+backup*/
+wp-navigation
+*.log
 __pycache__/


@@ -2,9 +2,119 @@
 from requests.auth import HTTPBasicAuth
 from getpass import getpass
 from urllib.parse import urlparse
-import argparse, logging
+from concurrent import futures
+from concurrent.futures import as_completed, wait
+import argparse, logging, threading
 from lib.WPImport import WPimport
 from lib.WPExport import WPExport
+from lib.WPRemove import WPRemove
+
+def remove(args, basic, logger):
+    removeWp = WPRemove(basic=basic, wordpress="", logger=logger)
+    if args.remove == True:
+        for i in args.wordpress.split(","):
+            removeWp.setUrl(i)
+            removeWp.cleanPosts()
+            removeWp.cleanTags()
+            removeWp.cleanCategories()
+            removeWp.cleanMedia()
+    else:
+        for i in args.wordpress.split(","):
+            removeWp.setUrl(i)
+            if args.posts == True:
+                removeWp.cleanPosts()
+            if args.categories == True:
+                removeWp.cleanCategories()
+            if args.tags == True:
+                removeWp.cleanTags()
+            if args.media == True:
+                removeWp.cleanMedia()
+    del removeWp
+
+def download(name_thread, max_thread, url, logger, parser, directory, html, img):
+    exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory)
+    webpage = exportWp.getUrlPage(name_thread, max_thread)
+    for i in ["article", "page"]:
+        for j in ["publications", "principal"]:
+            if html is False:
+                exportWp.downloadHTML(webpage[j][i])
+            if img is False:
+                exportWp.downloadImg(webpage[j][i])
+
+def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial):
+    canalblog = canalblog.split(",")
+    wordpress = wordpress.split(",")
+    name = "Thread-{0}".format(int(name_thread) + 1)
+    if serial is False:
+        for canal in canalblog:
+            try:
+                o = urlparse(canal)
+                o = o._replace(scheme="https")
+                url = o.geturl().replace(":///", "://")
+            except Exception as err:
+                logger.error("{0} : parsing error : {1}".format(name, err))
+                exit(1)
+            exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser)
+            webpage = exportWp.getUrlPage(name_thread, max_thread)
+            del exportWp
+            for j in wordpress:
+                importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser)
+                for k in ["article", "page"]:
+                    for l in ["publications", "principal"]:
+                        importWp.fromUrl(webpage[l][k])
+                del importWp
+    else:
+        if len(canalblog) != len(wordpress):
+            logger.error("{0} : ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress".format(name))
+            exit(1)
+        for i in range(0, len(canalblog)-1):
+            try:
+                o = urlparse(canalblog[i])
+                o = o._replace(scheme="https")
+                url = o.geturl().replace(":///", "://")
+            except Exception as err:
+                logger.error("parsing error : {0}".format(err))
+                exit(1)
+            exportWp = WPExport(name=name, url=url, logger=logger, parser=parser)
+            webpage = exportWp.getUrlPage(name_thread, max_thread)
+            del exportWp
+            importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser)
+            for k in ["article", "page"]:
+                for l in ["publications", "principal"]:
+                    importWp.fromUrl(webpage[l][k])
+            del importWp
+
+def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial):
+    name = "Thread-{0}".format(int(name_thread) + 1)
+    directory = directory.split(",")
+    wordpress = wordpress.split(",")
+    if serial is False:
+        for i in wordpress:
+            importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser)
+            for j in directory:
+                importWp.fromDirectory(j, name_thread, max_thread)
+            del importWp
+    else:
+        if len(directory) != len(wordpress):
+            logger.error("{0} : Error : Number directory is differant than wordpress".format(name))
+            exit(1)
+        for i in range(0, len(wordpress)-1):
+            importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser)
+            importWp.fromDirectory(directory[i])
+            del importWp
+
 if __name__ == '__main__':
@@ -13,16 +123,35 @@ if __name__ == '__main__':
parser.add_argument("--logfile", help="Log file", default="") parser.add_argument("--logfile", help="Log file", default="")
parser.add_argument("--quiet", help="No console output", action="store_true") parser.add_argument("--quiet", help="No console output", action="store_true")
parser.add_argument("--parser", help="Parser content", default="html.parser") parser.add_argument("--parser", help="Parser content", default="html.parser")
parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1)
subparsers = parser.add_subparsers(dest="command") subparsers = parser.add_subparsers(dest="command")
import_parser = subparsers.add_parser("import") import_parser = subparsers.add_parser("import")
import_parser.add_argument("--user", help="wordpress user", required=True) import_parser.add_argument("--user", help="wordpress user", required=True)
import_parser.add_argument("--password", help="password wordpress's user", default="")
import_parser.add_argument("--file", help="HTML file", default="") import_parser.add_argument("--file", help="HTML file", default="")
import_parser.add_argument("--directory", help="HTML directory", default="") import_parser.add_argument("--directory", help="HTML directory", default="")
import_parser.add_argument("--canalblog", help="URL Canalblog", default="") import_parser.add_argument("--canalblog", help="URL Canalblog", default="")
import_parser.add_argument("--wordpress", help="URL Wordpress", required=True) import_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
import_parser.add_argument("--serial", help="Serial execution", action="store_true") import_parser.add_argument("--serial", help="Serial execution", action="store_true")
import_parser.add_argument("--remove-all", dest="remove", help="Remove all", action="store_true")
import_parser.add_argument("--remove-posts", help="Remove all posts", dest="posts", action="store_true")
import_parser.add_argument("--remove-categories", help="Remove all categories", dest="categories", action="store_true")
import_parser.add_argument("--remove-tags", help="Remove all tags", dest="tags", action="store_true")
import_parser.add_argument("--remove-media", help="Remove all media", dest="media", action="store_true")
remove_parser = subparsers.add_parser("remove")
remove_parser.add_argument("--user", help="wordpress user", required=True)
remove_parser.add_argument("--password", help="password wordpress's user", default="")
remove_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
remove_parser.add_argument("--all", dest="remove", help="Remove all (posts, media, tags, categories)", action="store_true")
remove_parser.add_argument("--posts", help="Remove all posts", action="store_true")
remove_parser.add_argument("--categories", help="Remove all categories", action="store_true")
remove_parser.add_argument("--tags", help="Remove all tags", action="store_true")
remove_parser.add_argument("--media", help="Remove all media", action="store_true")
export_parser = subparsers.add_parser("export") export_parser = subparsers.add_parser("export")
@@ -64,76 +193,48 @@ if __name__ == '__main__':
     fileHandler.setFormatter(formatter)
     logger.addHandler(fileHandler)

-    if args.command == "import":
-        password = getpass()
-        if len(password) == 0:
-            logger.error("No password error !!! ")
-            exit(1)
+    if args.command == "import" or args.command == "remove":
+        password = args.password
+        if len(args.password) == 0:
+            password = getpass()
+            if len(password) == 0:
+                logger.error("No password error !!! ")
+                exit(1)
         basic = HTTPBasicAuth(args.user, password)
+
+    if args.command == "import":
         wordpress = args.wordpress.split(",")
-        importWp = WPimport(basic, "", logger, args.parser)
+        importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser)
         if len(args.file) > 0:
             for i in wordpress:
                 importWp.setUrl(i)
-                importWp.fromFile(args.file.split(","))
+                importWp.fromFile(files=args.file.split(","))
+            exit(0)
         if len(args.directory) > 0:
-            directory = args.directory.split(",")
-            if args.serial is False:
-                for i in wordpress:
-                    importWp.setUrl(i)
-                    for j in directory:
-                        importWp.fromDirectory(j)
-            else:
-                if len(directory) != len(wordpress):
-                    logger.error("ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress")
-                    exit(1)
-                for i in range(0, len(wordpress)-1):
-                    importWp.setUrl(wordpress[i])
-                    importWp.fromDirectory(directory[i])
+            remove(args, basic, logger)
+            try:
+                with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
+                    wait_for = [
+                        ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial)
+                        for i in range(0, int(args.parallel))
+                    ]
+            except Exception as err:
+                logger.error("Threading error : {0}".format(err))
+            exit(0)
         if len(args.canalblog) > 0:
-            exportWp = WPExport("", logger, args.parser, args.directory)
-            canalblog = args.canalblog.split(",")
-            wordpress = args.wordpress.split(",")
-
-            if args.serial is False:
-                for canal in canalblog:
-                    try:
-                        o = urlparse(canal)
-                        o = o._replace(scheme="https")
-                        url = o.geturl().replace(":///", "://")
-                    except Exception as err:
-                        logger.error("parsing error : {0}".format(err))
-                        exit(1)
-                    exportWp.setUrl(url)
-                    webpage = exportWp.getUrlPage()
-                    for j in wordpress:
-                        importWp.setUrl(j)
-                        importWp.fromUrl(webpage)
-            else:
-                if len(canalblog) != len(wordpress):
-                    logger.error("ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress")
-                    exit(1)
-                for i in range(0, len(canalblog)-1):
-                    try:
-                        o = urlparse(canalblog[i])
-                        o = o._replace(scheme="https")
-                        url = o.geturl().replace(":///", "://")
-                    except Exception as err:
-                        logger.error("parsing error : {0}".format(err))
-                        exit(1)
-                    exportWp.setUrl(url)
-                    webpage = exportWp.getUrlPage()
-                    importWp.setUrl(wordpress[i])
-                    importWp.fromUrl(webpage)
+            remove(args, basic, logger)
+            try:
+                with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
+                    wait_for = [
+                        ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial)
+                        for i in range(0, int(args.parallel))
+                    ]
+            except Exception as err:
+                logger.error("Threading error : {0}".format(err))
+            exit(0)
+
     if args.command == "export":
         canalblog = args.url.split(",")
-        exportWp = WPExport("", logger, args.parser, args.directory)
+        exportWp = WPExport(logger=logger, parser=args.parser, directory=args.directory)
         for canal in canalblog:
             try:
                 o = urlparse(canal)
@@ -148,12 +249,22 @@ if __name__ == '__main__':
             if args.css is False:
                 exportWp.downloadCss()
+            del exportWp
         if args.html is False or args.img is False:
-            webpage = exportWp.getUrlPage()
-            if args.html is False:
-                exportWp.downloadHTML(webpage)
-
-            if args.img is False:
-                exportWp.downloadImg(webpage)
+            try:
+                with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
+                    wait_for = [
+                        ex.submit(download, i, int(args.parallel), url, logger, args.parser, args.directory, args.html, args.img)
+                        for i in range(0, int(args.parallel))
+                    ]
+            except Exception as err:
+                logger.error("Threading error : {0}".format(err))
+        exit(0)
+
+    if args.command == "remove":
+        remove(args, basic, logger)

     exit(0)
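
A minimal, standalone sketch of the fan-out pattern the new import/export code relies on (illustrative names, not part of the diff): each worker receives its index and the worker count so it can claim its own slice of the pages.

from concurrent import futures

def worker(index, total):
    # each worker processes the slice of work identified by its index
    return "worker {0}/{1} done".format(index + 1, total)

max_thread = 4
with futures.ThreadPoolExecutor(max_workers=max_thread) as ex:
    jobs = [ex.submit(worker, i, max_thread) for i in range(max_thread)]
    for job in futures.as_completed(jobs):
        print(job.result())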

lib/WPExport.py

@@ -6,11 +6,13 @@ from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry

 class WPExport:
-    def __init__(self, url, logger, parser, directory):
+    def __init__(self, name = "Thread-0", url = "", logger = None, parser = "html.parser", directory = "backup"):
         self._url = url
         self._logger = logger
         self._parser = parser
         self._dir = directory
+        self._name = name

         self._request = requests.Session()
@@ -20,8 +22,18 @@ class WPExport:
         self._request.mount('http://', HTTPAdapter(max_retries=retries))

+    # Destructor
+    def __del__(self):
+        self._logger.info("{0} : Export finished for {1}".format(self._name, self._url))
+
     # Public method
+
+    # Set name
+    def setName(self, name):
+        self._name = "Thread-{0}".format(int(name) + 1)
+
     # Set URL
     def setUrl(self, url):
@@ -55,11 +67,11 @@ class WPExport:
     # Get URL
-    def getUrlPage(self):
+    def getUrlPage(self, index_thread, max_thread):
         try:
             page = self._request.get(self._url)
         except Exception as err:
-            self._logger.error("Connection error : {0}".format(err))
+            self._logger.error("{0} : Connection error : {1}".format(self._name, err))
             exit(1)

         page_url = []
         if page.status_code == 200:
@@ -69,53 +81,81 @@ class WPExport:
                 href = anchor.get('href', '/')
                 if href != "#":
                     page_url.append(href)
+        else:
+            self._logger.error("{0} : URL did not get due status code : {1}".format(self._name, page.status_code))
+            self._logger.debug("{0} : {1}".format(self._name, page.content))

-        webpage = []
+        webpage = {"principal": {"page":[], "article":[]}, "publications": {"page":[], "article":[]}}
         for i in page_url:
+            section = "publications"
+            o = urlparse(i)
+            o = o._replace(scheme="https")
+            i = o.geturl().replace(":///", "://")
+            if i == "{0}/".format(self._url):
+                section = "principal"
             try:
                 page = self._request.get(i)
             except Exception as err:
-                self._logger.error("Connection error : {0}".format(err))
+                self._logger.error("{0} : Connection error : {1}".format(self._name, err))
                 exit(1)
             if page.status_code == 200:
-                self._logger.info("page : {0}".format(i))
-                if i not in webpage:
-                    webpage.append(i)
+                self._logger.info("{0} : page : {1}".format(self._name, i))
+                if i not in webpage[section]["page"]:
+                    webpage[section]["page"].append(i)
                 soup = BeautifulSoup(page.text, self._parser)
-                class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline")
+                class_div = soup.find_all("div", class_="pagingfirstline")
                 if len(class_div) > 0:
                     pagingfirstline = class_div[0].find_all("a")
                     if len(pagingfirstline) > 1:
                         lastpage = pagingfirstline[len(pagingfirstline)-1].get("href", "/")
+                        self._logger.debug("{0} : Last page {1}".format(self._name, lastpage))
                         element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1]
                         number_page = element_lastpage.split("-")[0].split("p")[1]
                         number_lastpage = int(number_page) / 10
-                        for j in range(1,int(number_lastpage)):
+
+                        setPageDivided = int(number_lastpage) / max_thread
+                        if setPageDivided > int(setPageDivided):
+                            setPageDivided = setPageDivided + 1
+                        setPagePart = setPageDivided * (index_thread + 1)
+                        firstPagePart = (setPagePart - setPageDivided)
+                        self._logger.debug("{0} : Total page : {1}".format(self._name,int(number_lastpage)))
+                        self._logger.debug("{0} : First range : {1}".format(self._name, int(firstPagePart)))
+                        self._logger.debug("{0} : Last range : {1}".format(self._name, int(setPagePart)))
+                        for j in range(int(firstPagePart),int(setPagePart)+1):
                             paging = j * 10
                             categorie = urlparse(i).path.split("/")
                             url_paging = "{0}/archives/p{1}-10.html".format(self._url, paging)
                             if len(categorie) > 2:
                                 url_paging = "{0}/archives/{1}/p{2}-10.html".format(self._url, categorie[2], paging)
-                            self._logger.info(url_paging)
-                            if url_paging not in webpage:
-                                webpage.append(url_paging)
+                            self._logger.info("{0} : {1}".format(self._name, url_paging))
+                            if url_paging not in webpage[section]["page"]:
+                                webpage[section]["page"].append(url_paging)
                             page = self._request.get(url_paging)
                             if page.status_code == 200:
                                 soup = BeautifulSoup(page.text, self._parser)
                                 h2 = soup.find_all("h2")
+                                self._logger.debug("{0} : {1} H2 : {2}".format(self._name, url_paging, h2))
                                 for title in h2:
+                                    self._logger.debug("{0} : {1} a : {2}".format(self._name, url_paging, title.find_all("a")))
                                     href = title.find_all("a")[0].get("href", "/")
-                                    if href not in webpage:
+                                    if href not in webpage[section]["article"]:
                                         try:
                                             o = urlparse(href)
                                             o = o._replace(scheme="https").geturl()
                                         except Exception as err:
                                             self._logger.error("parsing error : {0}".format(err))
                                             exit(1)
-                                        webpage.append(o)
+                                        webpage[section]["article"].append(o)
+            else:
+                self._logger.error("{0} : web didn't get due status code : {1}".format(self._name, page.status_code))
+                self._logger.debug("{0} : {1}".format(self._name, page.content))

         return webpage

     # Private method
     #
     # Create path
@@ -127,7 +167,7 @@ class WPExport:
             makedir.append(i)
             repath = "/".join(makedir)
             if not os.path.exists(repath):
-                self._logger.debug("Dossier crée : {0}".format(repath))
+                self._logger.debug("{0} : Dossier crée : {1}".format(self._name, repath))
                 try:
                     if len(repath) > 0:
                         os.mkdir(repath)
@@ -179,6 +219,10 @@ class WPExport:
                     o = o._replace(netloc=u.netloc)
                     o = o._replace(scheme=u.scheme)
                     page_url.append(o.geturl())
+        else:
+            self._logger.error("JS or CSS did not get due status code : {0}".format(page.status_code))
+            self._logger.debug(page.content)
         return page_url
# Get image # Get image
@@ -189,18 +233,22 @@ class WPExport:
             try:
                 page = self._request.get(i)
             except Exception as err:
-                self._logger.error("Connection error : {0}".format(err))
+                self._logger.error("{0} : Connection error : {1}".format(self._name, err))
                 exit(1)
             if page.status_code == 200:
                 soup = BeautifulSoup(page.text, self._parser)
                 img = soup.find_all("img")
-                self._logger.info("image from page: {0} : ".format(i))
+                self._logger.info("{0} : image from page: {1} : ".format(self._name,i))
                 for anchor in img:
                     src = anchor.get("src", "/")
                     if src != "/":
                         if src not in page_img:
-                            self._logger.info("image: {0} : ".format(src))
+                            self._logger.info("{0} : image: {1} : ".format(self._name, src))
                             page_img.append(src)
+            else:
+                self._logger.error("{0} : Image did not get due status code : {1}".format(self._name, page.status_code))
+                self._logger.debug("{0} : {1}".format(self._name, page.content))

         return page_img
@@ -227,9 +275,12 @@ class WPExport:
                 fileDownload = "{0}/{1}/index.html".format(backup_dir, o.netloc)
                 if len(dir_page_web) > 0 and len(filePageWeb) > 0:
                     fileDownload = "{0}/{1}{2}/{3}".format(backup_dir, o.netloc, dir_page_web, filePageWeb)
-                self._logger.info("{0}/{1} : {2}".format(i+1, len(webpage), fileDownload))
+                self._logger.info("{0} : {1}/{2} : {3}".format(self._name, i+1, len(webpage), fileDownload))
                 try:
                     open(fileDownload, "wb").write(r.content)
                 except Exception as err:
                     self._logger.error("file error : {0}".format(err))
                     exit(1)
+            else:
+                self._logger.error("Not download due status code : {0}".format(r.status_code))
+                self._logger.debug(r.content)

lib/WPImport.py

@@ -5,22 +5,30 @@ from urllib.parse import urlparse
 import requests, os, logging, re, json
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
+from slugify import slugify

 class WPimport:
     # Constructor
-    def __init__(self, basic, wordpress, logger, parser):
+    def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser"):
+        self._name = name
         self._basic = basic
         self._wordpress = wordpress
         self._logger = logger
         self._parser = parser
+        self._headers_json = {'Content-Type': 'application/json; charset=utf-8', 'Accept':'application/json'}

         self._request = requests.Session()
-        retries = Retry(total=10,
+        retries = Retry(connect=10, read=10, redirect=5,
                 status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)

         self._request.mount('http://', HTTPAdapter(max_retries=retries))

+    # Destructor
+    def __del__(self):
+        print("{0} : Import finished for {1}".format(self._name, self._wordpress))
+
     # Public method

     def setUrl(self, wordpress):
@@ -31,40 +39,74 @@ class WPimport:
             try:
                 r = self._request.get(webpage[i])
             except Exception as err:
-                self._logger.error("Connection error : {0}".format(err))
+                self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err))
                 exit(1)
             if r.status_code == 200:
-                self._logger.info("({0}/{1} : Page en cours d'import : {2}".format(i+1, len(webpage), webpage[i]))
+                self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i]))
                 soup = BeautifulSoup(r.content, self._parser)
                 articlebody = soup.find_all("div", class_="articlebody")
                 if len(articlebody) > 0:
                     self._addOrUpdatePost(soup)
                 else:
                     self._addOrUpdateFeaturedMedia(soup)
+            else:
+                self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code))
+                self._logger.debug("{0} : {1}".format(self._name, r.content))

-    def fromDirectory(self, directory):
+    def fromDirectory(self, directory="", number_thread=1, max_thread=1):
         directory = "{0}/archives".format(directory)
         directories = self._getDirectories([], "{0}".format(directory))
-        files = self._getFiles(directories)
-        self.fromFile(files)
+        if len(directories) > 0:
+            files = self._getFiles(directories)
+            self.fromFile(files, number_thread, max_thread)
+        else:
+            self._logger.error("{0} : No files for {1}".format(self._name, directory))

-    def fromFile(self, files):
-        for file in files:
-            if os.path.exists(file):
-                self._logger.info("Fichier en cours de traitement : {0}".format(file))
-                with open(file, 'r') as f:
+    def fromFile(self, files=[], number_thread=1, max_thread=1):
+        divFiles = int(len(files) / max_thread)
+        currentRangeFiles = int(divFiles * (number_thread+1))
+        firstRange = int(currentRangeFiles - divFiles)
+        self._logger.debug("{0} : index : {1}".format(self._name,number_thread))
+        self._logger.debug("{0} : first range : {1}".format(self._name,firstRange))
+        self._logger.debug("{0} : last range : {1}".format(self._name,currentRangeFiles))
+        for i in range(firstRange, currentRangeFiles):
+            if os.path.exists(files[i]):
+                self._logger.info("{0} : ({1}/{2}) File is being processed : {3}".format(self._name, i+1, currentRangeFiles + 1, files[i]))
+                with open(files[i], 'r') as f:
                     content = f.read()
+                    self._logger.debug("{0} : Size of article : {1}".format(self._name, len(content)))
                     soup = BeautifulSoup(content, self._parser)
                     articlebody = soup.find_all("div", class_="articlebody")
+                    self._logger.debug("{0} : Number of article : {1}".format(self._name, len(articlebody)))
                     if len(articlebody) > 0:
                         self._addOrUpdatePost(soup)
                     else:
                         self._addOrUpdateFeaturedMedia(soup)

     # Private method
+
+    ## replace caracter
+    def _replaceCaracter(self, title_rendered):
+        list_replace = {'’': "'", '–': '-', '…': '...', '« ': '"', ' »': '"', '« ': '"', ' »': '"', '’': "'", '"‘': "'"}
+        for old, new in list_replace.items():
+            title_rendered = title_rendered.replace(old, new)
+        return title_rendered
+
+    ## remove space
+    def _removeSpace(self, title):
+        if title[len(title)-1] == " ":
+            title = title[:-1]
+        if title[0] == " ":
+            title = title[1:]
+        return title
+
     ## Get all files

     def _getFiles(self, item):
@@ -95,7 +137,7 @@ class WPimport:
         try:
             page = self._request.get("http://{0}/wp-json/wp/v2/search".format(self._wordpress), auth=self._basic, params=params)
         except Exception as err:
-            self._logger.error("Connection error : {0}".format(err))
+            self._logger.error("{0} : Connection error : {1}".format(self._name, err))
             exit(1)
         if page.status_code == 200:
             result = page.json()
@@ -107,7 +149,7 @@ class WPimport:
                 try:
                     page = self._request.get(img_src)
                 except Exception as err:
-                    self._logger.error("Connection error : {0}".format(err))
+                    self._logger.error("{0} : Connection error for get featured media : {1}".format(self._name, err))
                     exit(1)
                 if page.status_code == 200:
                     name_img = img_src.replace("_q", "")
@@ -116,23 +158,38 @@ class WPimport:
try: try:
page = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params) page = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error search featured media : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
res = page.json() res = page.json()
if len(res) > 0: if len(res) > 0:
id_media = res[0]["id"] id_media = res[0]["id"]
headers = {'Content-Type': 'application/json', 'Accept':'application/json'}
data = {"featured_media": id_media} data = {"featured_media": id_media}
try: try:
r = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, result[0]["id"]), auth=self._basic, headers=headers, data=json.dumps(data)) r = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, result[0]["id"]), auth=self._basic, headers=self._headers_json, data=json.dumps(data))
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for post media featured : {1}".format(self._name, err))
exit(1) exit(1)
if r.status_code == 200: if r.status_code == 200:
self._logger.info("Ajout media featured : {0}".format(r.json()["title"]["raw"])) self._logger.info("{0} : Add media featured : {1}".format(self._name, r.json()["title"]["raw"]))
else:
self._logger.error("{0} : Connection error with status code for featured media : {1}".format(self._name, r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))
else: else:
self._logger.info("Aucun media trouvé pour {0}".format(h2)) self._logger.info("{0} : No media found for {1}".format(self._name, h2))
else:
self._logger.error("{0} : Connection error with status code for search featured media: {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
else:
self._logger.error("{0} : Connection error for get featured media with status code : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
else:
self._logger.error("{0} : Connection error with status code for featured media : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
## Association image to post ## Association image to post
@@ -142,87 +199,186 @@ class WPimport:
try: try:
r = self._request.post("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, i["id"]), auth=self._basic, data=data) r = self._request.post("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, i["id"]), auth=self._basic, data=data)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for link image to post : {1}".format(self._name, err))
exit(1) exit(1)
if r.status_code == 200: if r.status_code == 200:
self._logger.info("Association d'une image à l'article {0}".format(title)) self._logger.info("{0} : Link image to post {1}".format(self._name, title))
else:
self._logger.error("{0} Connection error with status code for link image to post : {1}".format(self._name, r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))
## Add or update img ## Add or update img
def _addOrUpdateMedia(self, href_img, page): def _addOrUpdateMedia(self, href_img, page):
media_authorized = ["png", "jpg", "jpeg", "svg", "gif"]
media = {"id":"", "rendered":""} media = {"id":"", "rendered":""}
split_fileimg = href_img.split("/") split_fileimg = href_img.split("/")
img_name = split_fileimg[len(split_fileimg)-1] img_name = split_fileimg[len(split_fileimg)-1]
params = { "search": img_name} img_type_file = img_name.split(".")[len(img_name.split("."))-1]
try: is_img = True
r = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params) if img_type_file not in media_authorized:
except Exception as err: self._logger.error("{0} : Element {1} is not image".format(self._name,img_name))
self._logger.error("Connection error : {0}".format(err)) is_img = False
exit(1) if is_img is True:
if r.status_code == 200: self._logger.debug("{0} : Search for image {1} with URL {2}".format(self._name, img_name, "http://{0}/wp-json/wp/v2/media".format(self._wordpress)))
res = r.json() params = { "search": img_name}
if len(res) > 0:
params = {"force":1}
try:
r = self._request.delete("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, res[0]["id"]), auth=self._basic, params=params)
except Exception as err:
self._logger.error("Connection error : {0}".format(err))
exit(1)
if r.status_code == 200:
self._logger.info("Image supprimé {0}".format(img_name))
data = page.content
img_type = "image/png"
if img_name.split(".")[1] == "jpg" or img_name.split(".")[1] == "jpeg":
img_type = "image/jpg"
headers={ 'Content-Type': img_type,'Content-Disposition' : 'attachment; filename={0}'.format(img_name)}
try: try:
r = self._request.post("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, headers=headers, data=data) r = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for search media : {1}".format(self._name, err))
exit(1) exit(1)
if r.status_code == 201: self._logger.debug("{0} : Search for image {1} and his status code {2}".format(self._name, img_name, r.status_code))
self._logger.info("Ajout d'image {0}".format(img_name)) if r.status_code == 200:
res = r.json() res = r.json()
media["id"] = res["id"] self._logger.debug("{0} : Number of image in search : {1}".format(self._name, len(res)))
media["rendered"] = res["guid"]["rendered"] if len(res) > 0:
params = {"force":1}
try:
r = self._request.delete("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, res[0]["id"]), auth=self._basic, params=params)
except Exception as err:
self._logger.error("{0} Connection error for delete image : {1}".format(self._name, err))
exit(1)
if r.status_code == 200:
self._logger.info("{0} : Image removed {1}".format(self._name, img_name))
else:
self._logger.error("{0} : Image {1} not removed due status code : {2}".format(self._name, img_name, r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))
data = page.content
img_type = "image/{0}".format(img_type_file)
if img_type_file == "jpg":
img_type = "image/jpeg"
headers={ 'Content-Type': img_type,'Content-Disposition' : 'attachment; filename={0}'.format(img_name)}
try:
r = self._request.post("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, headers=headers, data=data)
except Exception as err:
self._logger.error("{0} : Connection error for add image : {1}".format(self._name, err))
exit(1)
if r.status_code == 201:
self._logger.info("{0} : Image added {1}".format(self._name, img_name))
res = r.json()
media["id"] = res["id"]
media["rendered"] = res["guid"]["rendered"]
else:
self._logger.error("{0} : Image {1}.{2} not added due status code : {3}".format(self._name, img_name, img_type, r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))
else:
self._logger.error("{0} : Connection error for search image with status code : {1}".format(self._name, r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))
return media return media
## Add or update comment ## Add or update comment
def _addOrUpdateComment(self, post, comment, title): def _addOrUpdateComment(self, post, comment, title):
params = {"post": post}
block = True
try:
page = self._request.get("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, params=params)
except Exception as err:
self._logger.error("Connection error : {0}".format(err))
exit(1)
if page.status_code == 200:
result = page.json()
for i in comment: for i in comment:
comment_exist = False
for j in result: try:
if i["author"] == j["author_name"] and i["date"] == j["date"]: params = {"post": post, "author_name":i["author"], "date":i["date"]}
comment_exist = True page = self._request.get("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, params=params)
id_comment = j["id"] except Exception as err:
data = {"post": post, "content": i["content"], "date": i["date"], "author_name": i["author"]} self._logger.error("{0} : Connection error for search comment : {1}".format(self._name, err))
if comment_exist is True: exit(1)
if page.status_code == 200:
result = page.json()
for j in result:
try:
params = {"force":1}
page = self._request.delete("http://{0}/wp-json/wp/v2/comments/{1}".format(self._wordpress, j["id"]), params=params, auth=self._basic)
except Exception as err:
self._logger.error("{0} : Connection error for delete comment : {1}".format(self._name, err))
exit(1)
if page.status_code == 200:
self._logger.info("{0} : Comment deleted for {1}".format(self._name, title))
self._logger.debug("{0} : Comment deleted : {1}".format(self._name, j))
else:
self._logger.error("{0} : Comment not deleted for {1} due status code : {2}".format(self._name, title, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
else:
self._logger.error("{0} : Comment not listed for {1} due status code : {2}".format(self._name, title, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
for i in comment:
data = {"post": post, "content": i["content"], "date": i["date"], "author_name": i["author"], "status": "approved"}
if i["parent_id"] != -1:
parent_id = int(i["parent_id"])
params = {"post": post, "author_name":comment[parent_id]["author"], "date":comment[parent_id]["date"]}
try: try:
page = page = self._request.post("http://{0}/wp-json/wp/v2/comments/{1}".format(self._wordpress, id_comment), auth=self._basic, data=data) page = self._request.get("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for parent comment : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
self._logger.info("Commentaire mise à jour pour {0}".format(title)) result = page.json()
else: if len(result) > 0:
try: data["parent"]=result[0]["id"]
page = self._request.post("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, data=data) else:
except Exception as err: self._logger.error("{0} : Connection error for parent comment with status code : {1}".format(self._name, page.status_code))
self._logger.error("Connection error : {0}".format(err)) self._logger.debug("{0} : {1}".format(self._name, page.content))
exit(1)
if page.status_code == 201:
self._logger.info("Commentaire ajoute pour {0}".format(title))
try:
page = self._request.post("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, data=data)
except Exception as err:
self._logger.error("{0} : Connection error for add comment : {1}".format(self._name, err))
exit(1)
if page.status_code == 201:
self._logger.info("{0} : Comment added for {1}".format(self._name, title))
self._logger.debug("{0} : Data : {1}".format(self._name, data))
else:
self._logger.error("{0} : Comment not added for {1} due status code : {2}".format(self._name, title, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
## Check class name
def _hasClassName(self, tag, className):
for i in tag["class"]:
if i == className:
return True
return False
## Get class name
def _getClassName(self, tag, className):
for i in tag["class"]:
if re.match(className, i):
return i
return ""
## Get all comments
def _getComment(self, comment):
comment_post = []
for i in range(0, len(comment)):
comment_div = comment[i].find("div", class_="comment_item")
comment_item = comment_div.text.split("\n")
footer = comment_div.find_all("div", class_="itemfooter")
comment_author = footer[0].text.split(",")[0].replace("Posté par ", "")
comment_date = footer[0].find_all("abbr")[0].get("title")
comment_content = "<p>"
for j in range(0, len(comment_item)-2):
if len(comment_item[j]) > 0:
comment_content = comment_content + comment_item[j] + "<br />"
comment_content = comment_content + "</p>"
parent = -1
if self._hasClassName(comment[i], "level-1") is False:
block = False
className = self._getClassName(comment[i], "level-").split("-")
level = 1
if len(className) > 0:
level = int(className[1])
for j in range(i-1, 0, -1):
if block is False:
levelName = "level-{0}".format(level - 1)
if self._hasClassName(comment[j], levelName) is True:
parent = j
block = True
comment_post.append({"author": comment_author, "date": comment_date, "content": comment_content, "parent_id":parent})
return comment_post
## Add or Update post ## Add or Update post
def _addOrUpdatePost(self, soup): def _addOrUpdatePost(self, soup):
@@ -234,28 +390,24 @@ class WPimport:
listelement = {} listelement = {}
for i in liste: for i in liste:
try: element[i] = []
page = self._request.get("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress,i)) listelement[i] = []
except Exception as err:
self._logger.error("Connection error : {0}".format(err))
exit(1)
if page.status_code == 200:
elements[i] = page.json()
element[i] = []
listelement[i] = []
articletitle = soup.find_all("h2", class_="articletitle") articletitle = soup.find_all("h2", class_="articletitle")
self._logger.debug("{0} : Title of the article : {1}".format(self._name, articletitle))
articlebody = soup.find_all("div", class_="articlebody") articlebody = soup.find_all("div", class_="articlebody")
articledate = soup.find_all("span", class_="articledate") articledate = soup.find_all("span", class_="articledate")
articleacreator = soup.find_all("span", class_="articlecreator") articleacreator = soup.find_all("span", class_="articlecreator")
dateheader = soup.find_all("div", class_="dateheader") dateheader = soup.find_all("div", class_="dateheader")
itemfooter = soup.find_all("div", class_="itemfooter") itemfooter = soup.find_all("div", class_="itemfooter")
comment = soup.find_all("div", class_="comment_item") comment = soup.find_all("li", class_="comment")
img_a = articlebody[0].find_all("a", {"target": "_blank"}) img_a = articlebody[0].find_all("a", {"target": "_blank"})
self._logger.debug("{0} : Number of image's link : {1}".format(self._name, len(img_a)))
list_img = [] list_img = []
for i in img_a: for i in img_a:
new_img = {} new_img = {}
img = i.find_all("img") img = i.find_all("img")
self._logger.debug("{0} : Number of image's tag : {1}".format(self._name, len(img)))
if len(img) > 0: if len(img) > 0:
href_a = i.get("href") href_a = i.get("href")
href_img = img[0].get("src") href_img = img[0].get("src")
@@ -264,15 +416,16 @@ class WPimport:
try: try:
page_img = self._request.get(href_img) page_img = self._request.get(href_img)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err))
exit(1) exit(1)
if page_img.status_code == 404: if page_img.status_code == 404:
href_img = href_a href_img = href_a
try: try:
page_img = self._request.get(href_a) page_img = self._request.get(href_a)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err))
exit(1) exit(1)
self._logger.debug("{0} : Status code for image {1} : {2}".format(self._name, href_img, page_img.status_code))
if page_img.status_code == 200: if page_img.status_code == 200:
media=self._addOrUpdateMedia(href_img, page_img) media=self._addOrUpdateMedia(href_img, page_img)
new_img["id"]=media["id"] new_img["id"]=media["id"]
@@ -283,19 +436,13 @@ class WPimport:
new_img["id"]=media["id"] new_img["id"]=media["id"]
new_img["new_src"]=media["rendered"] new_img["new_src"]=media["rendered"]
list_img.append(new_img) list_img.append(new_img)
if page_img.status_code not in [200, 404]:
self._logger.error("{0} : Connection error with status code for get image : {1}".format(self._name, page_img.status_code))
self._logger.debug("{0} : {1}".format(self._name, page_img.content))
self._logger.debug("{0} : Number of image : {1}".format(self._name, len(list_img)))
comment_post = self._getComment(comment)
comment_post = []
for i in comment:
comment_item = i.text.split("\n")
footer = i.find_all("div", class_="itemfooter")
comment_author = footer[0].text.split(",")[0].replace("Posté par ", "")
comment_date = footer[0].find_all("abbr")[0].get("title")
comment_content = "<p>"
for j in range(0, len(comment_item)-2):
if len(comment_item[j]) > 0:
comment_content = comment_content + comment_item[j] + "<br />"
comment_content = comment_content + "</p>"
comment_post.append({"author": comment_author, "date": comment_date, "content": comment_content})
a = itemfooter[0].find_all("a", {"rel": True}) a = itemfooter[0].find_all("a", {"rel": True})
for i in a: for i in a:
rel = i.get("rel") rel = i.get("rel")
@@ -308,20 +455,61 @@ class WPimport:
for i in liste: for i in liste:
for j in element[i]: for j in element[i]:
element_exist = False element_exist = False
for k in elements[i]: title_element = self._removeSpace(j)
if k["name"] == j: for index in range(1,10):
element_exist = True self._logger.info("{0} : search {1} with index {2} : {3}".format(self._name, i, index, title_element))
listelement[i].append(k["id"])
if element_exist is False:
data = {"name": j}
try: try:
page = self._request.post("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, data=data) params = {"search":title_element, "per_page":"100", "page":index}
except Exception as err: page = self._request.get("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, params=params)
self._logger.error("Connection error : {0}".format(err)) except ConnectionError as err:
self._logger.error("{0} : Connection error for {1} : {2}".format(self._name, i, err))
exit(1) exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for {1} : {2}".format(self._name, i, err))
if page.status_code == 200:
result = page.json()
self._logger.debug("{0} : content {3} {2} : {1}".format(self._name, result, title_element, i))
if len(result) > 0:
for k in result:
title_rendered = k["name"]
self._logger.debug("{0} : content {2} : {1}".format(self._name, title_rendered, i))
self._logger.debug("{0} : size of content {3} : {2} - {1}".format(self._name, len(title_rendered), len(title_element), i))
if len(title_element) != len(title_rendered):
title_rendered = self._replaceCaracter(title_rendered)
if title_element == title_rendered:
self._logger.info("{0} : {1} found : {2}".format(self._name, i, title_rendered))
element_exist = True
listelement[i].append(k["id"])
else:
break
if page.status_code == 400:
self._logger.error("{0} : {1} not found due status code : {2}".format(self._name, i, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
break
else:
self._logger.error("{0} : {1} not found due status code : {2}".format(self._name, i, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
self._logger.debug("{0} : Element {3} {2} is {1}".format(self._name, element_exist, title_element, i))
if element_exist == False:
data = {"name": title_element}
self._logger.info("{0} : Create {1} : {2}".format(self._name, i, title_element))
self._logger.debug("{0} : Data : {1}".format(self._name, data))
try:
page = self._request.post("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, headers=self._headers_json, data=json.dumps(data))
except ConnectionError as err:
self._logger.error("{0} : Connection error for post {1} : {2}".format(self._name, i, err))
exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for post {1} : {2}".format(self._name, i, err))
if page.status_code == 201: if page.status_code == 201:
self._logger.info("{0} : {1} created : {2}".format(self._name, i, j))
result = page.json() result = page.json()
listelement[i].append(result["id"]) listelement[i].append(result["id"])
else:
self._logger.error("{0} : {1} not added due status code : {2}".format(self._name, i, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
title = articletitle[0].text title = articletitle[0].text
author = articleacreator[0].text.lower() author = articleacreator[0].text.lower()
@@ -339,51 +527,122 @@ class WPimport:
bodyhtml = bodyhtml.replace(i["old_src"], o.path) bodyhtml = bodyhtml.replace(i["old_src"], o.path)
hour = articledate[0].text hour = articledate[0].text
time = dateheader[0].text.split(" ") time = dateheader[0].text.split(" ")
self._logger.debug("{0} : Title post : |{1}|".format(self._name, title))
title = self._removeSpace(title)
self._logger.debug("{0} : Rendered Title post : |{1}|".format(self._name, title))
data = {"title":title, "content":bodyhtml, "status":"publish", "date": "{0}-{1}-{2}T{3}:00".format(time[2],month[time[1]],time[0], hour), "tags": listelement["tags"], "categories": listelement["categories"]} data = {"title":title, "content":bodyhtml, "status":"publish", "date": "{0}-{1}-{2}T{3}:00".format(time[2],month[time[1]],time[0], hour), "tags": listelement["tags"], "categories": listelement["categories"]}
params = {"search":author} self._logger.debug("{0} : Data for post : |{1}| : {2}" .format(self._name, title, data))
try:
page = self._request.get("http://{0}/wp-json/wp/v2/users".format(self._wordpress), auth=self._basic, params=params)
except Exception as err:
self._logger.error("Connection error : {0}".format(err))
exit(1)
if page.status_code == 200:
result = page.json()
data["author"] = result[0]["id"]
params = {"search":title} params = {"search":author, "per_page":100}
try: try:
page = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params) self._logger.info("{0} : Search author : {1}".format(self._name, author))
except Exception as err: page = self._request.get("http://{0}/wp-json/wp/v2/users".format(self._wordpress), auth=self._basic, headers=self._headers_json, params=params)
self._logger.error("Connection error : {0}".format(err)) self._logger.debug("{0} : End Search author : {1}".format(self._name, author))
exit(1) self._logger.debug("{0} : Debug requests : {1}".format(self._name, page.content))
page_exist = True
headers = {'Content-Type': 'application/json', 'Accept':'application/json'}
if page.status_code == 200:
result = page.json()
if len(result) == 0:
page_exist = False
else:
self._logger.info("La page {0} existe deja et mis à jour".format(title))
post_id = result[0]["id"]
try:
page = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, data=json.dumps(data))
except Exception as err:
self._logger.error("Connection error : {0}".format(err))
exit(1)
if page.status_code == 200:
result = page.json()
self._logger.info("Article mis à jour : {0}".format(result["title"]["raw"]))
self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"])
self._linkImgPost(result["title"]["raw"], list_img, result["id"])
if page_exist == False:
except ConnectionError as err:
self._logger.error("{0} : Connection error for get author : {1}".format(self._name, err))
exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for get author : {1}".format(self._name, err))
if page.status_code == 200:
self._logger.info("{0} : Get author id : {1}".format(self._name, result))
result = page.json()
for a in result:
data["author"] = a["id"]
else:
self._logger.error("{0} : Connection error with status code for get author : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(page.content))
page_is_exist = False
for index in range(1,10):
params = {"search": title, "per_page":100, "page": index}
try: try:
page = self._request.post("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, headers=headers, data=json.dumps(data)) self._logger.info("{0} : Search post with index {2} : {1}".format(self._name, title, index))
except Exception as err: page = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params, headers=self._headers_json)
self._logger.error("Connection error : {0}".format(err)) except ConnectionError as err:
self._logger.error("{0} : Connection error for search post : {1}".format(self._name, err))
exit(1) exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for search post : {1}".format(self._name, err))
if page.status_code == 200:
self._logger.debug("{0} : Encoding : {1}".format(self._name, page.encoding))
page.encoding = "utf-8"
result = page.json()
if len(result) == 0:
break
self._logger.info("{0} : Number result posts : {1}".format(self._name, len(result)))
count = 0
for i in result:
title_rendered = i["title"]["rendered"]
self._logger.info("{0} : Search title posts for |{2}| : |{1}|".format(self._name, title_rendered, title))
if len(title_rendered) != len(title):
title_rendered = self._replaceCaracter(title_rendered)
self._logger.debug("{0} : Search title posts for |{2}| : |{1}|".format(self._name, title_rendered, title))
self._logger.debug("{0} : SIze of title : {1} - {2}".format(self._name, len(title), len(title_rendered)))
if title_rendered == title:
page_is_exist = True
post_id = i["id"]
count = count + 1
if count > 1:
self._logger.info("{0} : Page {1} is double and going to delete".format(self._name, title))
try:
params = {"force":1}
page = self._request.delete("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=self._headers_json, params=params)
except ConnectionError as err:
self._logger.error("{0} : Connection error for deleted post : {1}".format(self._name, err))
exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for deleted post : {1}".format(self._name, err))
if page.status_code == 200:
self._logger.info("{0} : Post deleted : {1}".format(self._name, title))
else:
self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
else:
self._logger.debug("{0} : Data for post to update : {1}".format(self._name, i))
self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, title))
try:
page = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=self._headers_json, data=json.dumps(data))
except ConnectionError as err:
self._logger.error("{0} : Connection error for update post : {1}".format(self._name, err))
exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for update post : {1}".format(self._name, err))
if page.status_code == 200:
result = page.json()
self._logger.info("{0} : Post updated : {1}".format(self._name, title))
self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"])
self._linkImgPost(result["title"]["raw"], list_img, result["id"])
else:
self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
if page.status_code == 400:
self._logger.error("{0} : Connection for update post unauthorized : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
break
else:
self._logger.error("{0} : Connection for update post error with status code : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
if page_is_exist is False:
try:
self._logger.info("{0} : Creating posts : {1}".format(self._name, data["title"]))
page = self._request.post("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, headers=self._headers_json, data=json.dumps(data))
except ConnectionError as err:
self._logger.error("{0} : Connection error for create post : {1}".format(self._name, err))
exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for create post : {1}".format(self._name, err))
if page.status_code == 201: if page.status_code == 201:
result = page.json() result = page.json()
self._logger.info("Article ajoute : {0}".format(result["title"]["raw"])) self._logger.info("{0} : Post added : {1}".format(self._name, result["title"]["raw"]))
self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"])
self._linkImgPost(result["title"]["raw"], list_img, result["id"]) self._linkImgPost(result["title"]["raw"], list_img, result["id"])
else:
self._logger.error("{0} : Post not added due status code : {1}".format(self._name, r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))

lib/WPRemove.py (new file, 75 lines)

@@ -0,0 +1,75 @@
+#!/usr/bin/python3
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+import requests, os, logging, re, json
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+
+class WPRemove:
+    # Constructor
+    def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None):
+        self._name = name
+        self._basic = basic
+        self._wordpress = wordpress
+        self._logger = logger
+        self._headers_json = {'Content-Type': 'application/json', 'Accept':'application/json'}
+
+        self._request = requests.Session()
+
+        retries = Retry(connect=10, read=10, redirect=5,
+                status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)
+
+        self._request.mount('http://', HTTPAdapter(max_retries=retries))
+
+    # Destructor
+    def __del__(self):
+        print("{0} : Import finished for {1}".format(self._name, self._wordpress))
+
+    # Public method
+
+    def setUrl(self, wordpress):
+        self._wordpress = wordpress
+
+    def cleanPosts(self):
+        self._removeAll("posts")
+
+    def cleanTags(self):
+        self._removeAll("tags")
+
+    def cleanCategories(self):
+        self._removeAll("categories")
+
+    def cleanMedia(self):
+        self._removeAll("media")
+
+    # Private method
+
+    def _removeAll(self, composant):
+        params = {"per_page":100}
+        try:
+            self._logger.info("{0} : List {2} to remove for url : {1}".format(self._name, self._wordpress, composant))
+            r = self._request.get("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, composant), auth=self._basic, params=params, headers=self._headers_json)
+        except Exception as err:
+            self._logger.error("{0} : Connection error for list {1} to remove : {2}".format(self._name, composant, err))
+        if r.status_code == 200:
+            result = r.json()
+            if len(result) > 0:
+                for i in result:
+                    self._logger.info("{0} : Remove {2} for url {1} : {3}".format(self._name, self._wordpress, composant, i["title"]["rendered"]))
+                    params = {"force":1}
+                    try:
+                        r = self._request.delete("http://{0}/wp-json/wp/v2/{1}/{2}".format(self._wordpress, composant, i["id"]), auth=self._basic, headers=self._headers_json , params=params)
+                        if r.status_code == 200:
+                            self._logger.info("{0} : Post removed for URL {1} {2} : {3}".format(self._name, self._wordpress, composant, i["title"]["rendered"]))
+                        else:
+                            self._logger.error("{0} : Connection error for post {1} {2} {3} with status code {4}".format(self._name, self._wordpress, composant, i["title"]["rendered"], r.status_code))
+                    except Exception as err:
+                        self._logger.error("{0} : Connection error for {1} remove : {2}".format(self._name, composant, err))
+                        exit(1)
+                self._removeAll(composant)
+        else:
+            self._logger.error("{0} : Error for list to remove {1} due status code {2}".format(self._name, composant, r.status_code))
+            self._logger.debug("{0} : Content error for {1} : {2}".format(self._name, composant, r.content))
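
For context, a minimal usage sketch of the new WPRemove class, based only on the constructor and public methods above; the credentials and WordPress host are placeholders, and a reachable WordPress REST API is assumed.

import logging
from requests.auth import HTTPBasicAuth
from lib.WPRemove import WPRemove

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("wp-remove")
basic = HTTPBasicAuth("admin", "secret")        # placeholder credentials
removeWp = WPRemove(basic=basic, wordpress="", logger=logger)
removeWp.setUrl("blog.example.org")             # placeholder WordPress host
removeWp.cleanPosts()
removeWp.cleanTags()
removeWp.cleanCategories()
removeWp.cleanMedia()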