40 Commits

Author SHA1 Message Date
fb59746fc0 Merge pull request 'https' (#13) from https into master
Reviewed-on: #13
2023-05-26 09:24:21 +00:00
5916cbff00 fix parameter 2023-05-26 10:04:36 +02:00
cd2fbd5372 add protocol https/http 2023-05-25 00:31:34 +02:00
f3b04f9459 update script backup 2023-05-24 23:34:03 +02:00
a400375e01 remove slugify import 2023-05-24 23:30:23 +02:00
351cb10f01 Merge pull request 'fix-media' (#12) from fix-media into master
Reviewed-on: #12
2023-05-23 14:47:07 +00:00
5c5dc707f5 fix headers search author 2023-05-23 16:46:07 +02:00
f69298179a reduce line code and add private method 2023-05-23 13:45:59 +02:00
d3ec7d147d loop replace 2023-05-23 11:22:37 +02:00
0fc6e78a18 fix title rendered 2023-05-23 00:02:51 +02:00
3718b807ba more message debug 2023-05-21 21:14:36 +02:00
75772ba7f0 remove doublon 2023-05-21 21:12:00 +02:00
769b7f43fc fix add or update post 2023-05-18 00:24:41 +02:00
ba42d56be1 fix webpage 2023-05-16 00:15:16 +02:00
d18f4e1579 Add clean 2023-05-15 23:51:45 +02:00
8bdaea3910 add remove command 2023-05-15 23:42:18 +02:00
f3cb5c4069 fix parameters 2023-05-15 23:22:41 +02:00
cfb24bed0e add remove parameters 2023-05-15 23:21:25 +02:00
ee8674fd59 add remove class 2023-05-15 23:13:55 +02:00
ece4d78dd8 add remove all 2023-05-14 18:35:36 +02:00
3d7aa19441 add update 2023-05-12 00:16:58 +02:00
3c2f1cc017 separate publication and principal 2023-05-07 17:38:44 +02:00
f9be6770e3 separate article and page 2023-05-07 09:26:48 +02:00
21d2f35e6e add password parameter and fix post to delete 75% 2023-05-04 00:47:06 +02:00
4789fe80aa fix import 50% 2023-05-02 16:59:31 +02:00
3161a06459 Merge pull request 'thread' (#9) from thread into master
Reviewed-on: #9
2023-05-01 20:05:02 +00:00
1f6bd96a8e add del 2023-05-01 21:58:47 +02:00
b359521001 fix from directory import 2023-05-01 21:44:33 +02:00
73c0998ae0 fix thread fromDirectory and fromUrl 2023-05-01 21:18:57 +02:00
939e744d1d remove draft file 2023-05-01 15:45:59 +02:00
0029898e6e add debug message + fix error directory list 2023-05-01 15:45:34 +02:00
ab3720fbbc fix directory in thread 2023-04-29 22:26:47 +02:00
7a1286c4e2 add thread for directory import 2023-04-28 23:37:13 +02:00
5a4bdbb420 add name thread in message logger 2023-04-28 23:14:57 +02:00
bf4c2480f8 import threading for directory WIP 2023-04-27 00:00:53 +02:00
a0b816fe18 add debug thread 2023-04-26 23:03:43 +02:00
08ff16527d fix thread in parallelism 2023-04-25 16:15:45 +02:00
0acd5067cb thread 50% 2023-04-25 00:34:25 +02:00
aaac2385a3 fix previos commit 2023-04-24 23:16:53 +02:00
88f258ffba Add parallelism 2023-04-24 23:15:29 +02:00
6 changed files with 646 additions and 288 deletions

6
.gitignore vendored
View File

@@ -1,4 +1,4 @@
backup/ backup*/
backup1/ wp-navigation
web_scrap.log *.log
__pycache__/ __pycache__/

View File

@@ -3,8 +3,8 @@
TAR=/usr/bin/tar TAR=/usr/bin/tar
PYTHON=/usr/bin/python3 PYTHON=/usr/bin/python3
GZIP=/usr/bin/gzip GZIP=/usr/bin/gzip
SCRIPTDIR=/home/valentin/script SCRIPTDIR=/home/valentin/script/webscrap
WEBSCRAP=${SCRIPTDIR}/web_scrap.py WEBSCRAP=${SCRIPTDIR}/import_export_canalblog.py
URL=www.clarissariviere.com URL=www.clarissariviere.com
DATE=$(date +%Y%m%d) DATE=$(date +%Y%m%d)
DIRECTORY=/home/valentin/backup DIRECTORY=/home/valentin/backup
@@ -24,7 +24,7 @@ else
fi fi
subject="${subject} ${URL} ${DATE}" subject="${subject} ${URL} ${DATE}"
echo > ${BACKUPDIR}/${LOGFILE} echo > ${BACKUPDIR}/${LOGFILE}
${PYTHON} ${WEBSCRAP} --url ${URL} --dir ${DIRECTORY} --quiet --logfile ${BACKUPDIR}/${LOGFILE} ${PYTHON} ${WEBSCRAP} --quiet --logfile ${BACKUPDIR}/${LOGFILE} --parallel 20 export --url ${URL} --directory ${DIRECTORY}
if [ ${?} -ne 0 ]; then if [ ${?} -ne 0 ]; then
subject="${subject} echoue : recuperation page" subject="${subject} echoue : recuperation page"
echo ${subject} | mail -s "${subject}" -A ${BACKUPDIR}/${LOGFILE} ${SENDER} echo ${subject} | mail -s "${subject}" -A ${BACKUPDIR}/${LOGFILE} ${SENDER}

View File

@@ -2,39 +2,159 @@
from requests.auth import HTTPBasicAuth from requests.auth import HTTPBasicAuth
from getpass import getpass from getpass import getpass
from urllib.parse import urlparse from urllib.parse import urlparse
import argparse, logging, threading from concurrent import futures
from concurrent.futures import as_completed, wait
import argparse, logging, threading
from lib.WPImport import WPimport from lib.WPImport import WPimport
from lib.WPExport import WPExport from lib.WPExport import WPExport
from lib.WPRemove import WPRemove
def remove(args, basic, logger, ssl_wordpress):
    """Delete content from every WordPress site listed in ``args.wordpress``.

    ``args.wordpress`` is a comma-separated list of sites. When
    ``args.remove`` is set, posts, tags, categories and media are all
    cleaned. Otherwise only the kinds selected through ``args.posts``,
    ``args.categories``, ``args.tags`` and ``args.media`` are cleaned.

    Args:
        args: parsed command-line namespace; the attributes named above are read.
        basic: authentication object forwarded to ``WPRemove``.
        logger: logger forwarded to ``WPRemove``.
        ssl_wordpress: forwarded to ``WPRemove`` as ``ssl_wordpress``.
    """
    removeWp = WPRemove(basic=basic, wordpress="", logger=logger, ssl_wordpress=ssl_wordpress)
    for site in args.wordpress.split(","):
        removeWp.setUrl(site)
        if args.remove:
            # "Remove all": same call order as before (posts, tags, categories, media).
            removeWp.cleanPosts()
            removeWp.cleanTags()
            removeWp.cleanCategories()
            removeWp.cleanMedia()
            continue
        # Selective removal: only the kinds explicitly requested.
        if args.posts:
            removeWp.cleanPosts()
        if args.categories:
            removeWp.cleanCategories()
        if args.tags:
            removeWp.cleanTags()
        if args.media:
            removeWp.cleanMedia()
    del removeWp
def download(exportWp, html, img): def download(name_thread, max_thread, url, logger, parser, directory, html, img, ssl_canalblog):
webpage = exportWp.getUrlPage() exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory, ssl_canalblog=ssl_canalblog)
if html is False: webpage = exportWp.getUrlPage(name_thread, max_thread)
exportWp.downloadHTML(webpage) for i in ["article", "page"]:
for j in ["publications", "principal"]:
if html is False:
exportWp.downloadHTML(webpage[j][i])
if args.img is False: if img is False:
exportWp.downloadImg(webpage) exportWp.downloadImg(webpage[j][i])
def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress, ssl_canalblog):
    """Import pages crawled from Canalblog sites into WordPress sites.

    This is the body of one worker: ``name_thread`` and ``max_thread`` are
    forwarded to ``WPExport.getUrlPage`` so that each worker receives its own
    slice of the pages.

    Args:
        name_thread: zero-based index of the worker.
        max_thread: total number of workers.
        canalblog: comma-separated Canalblog URLs.
        logger: logger used for error reporting and forwarded to the helpers.
        parser: parser name forwarded to ``WPExport`` and ``WPimport``.
        wordpress: comma-separated WordPress sites.
        basic: authentication object forwarded to ``WPimport``.
        serial: when ``False`` every Canalblog site is imported into every
            WordPress site; otherwise the two lists are paired element by
            element and must have the same length.
        ssl_wordpress: forwarded to ``WPimport``.
        ssl_canalblog: ``False`` selects ``http`` for Canalblog, anything
            else selects ``https``.

    Raises:
        SystemExit: on a URL parsing error, or in serial mode when the two
            lists do not have the same length.
    """
    canalblog = canalblog.split(",")
    wordpress = wordpress.split(",")
    name = "Thread-{0}".format(int(name_thread) + 1)
    protocol = "http" if ssl_canalblog is False else "https"

    def crawl(canal, prefix):
        # Force the configured scheme, then list this worker's share of pages.
        try:
            o = urlparse(canal)
            o = o._replace(scheme=protocol)
            # A URL given without scheme parses with an empty netloc, which
            # geturl() renders as "scheme:///host"; collapse the extra slash.
            url = o.geturl().replace(":///", "://")
        except Exception as err:
            logger.error("{0}parsing error : {1}".format(prefix, err))
            exit(1)
        exportWp = WPExport(name=name, url=url, logger=logger, parser=parser, ssl_canalblog=ssl_canalblog)
        webpage = exportWp.getUrlPage(name_thread, max_thread)
        del exportWp
        return webpage

    def push(target, webpage):
        # Send every crawled article and page to one WordPress site.
        importWp = WPimport(name=name, basic=basic, wordpress=target, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress)
        for kind in ["article", "page"]:
            for section in ["publications", "principal"]:
                importWp.fromUrl(webpage[section][kind])
        del importWp

    if serial is False:
        for canal in canalblog:
            webpage = crawl(canal, "{0} : ".format(name))
            for target in wordpress:
                push(target, webpage)
    else:
        if len(canalblog) != len(wordpress):
            logger.error("{0} : ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress".format(name))
            exit(1)
        # zip() walks every pair; the former range(0, len - 1) skipped the
        # last pair and did nothing at all when a single pair was given.
        for canal, target in zip(canalblog, wordpress):
            push(target, crawl(canal, ""))
def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial, ssl_wordpress):
    """Import previously exported HTML directories into WordPress sites.

    This is the body of one worker: ``name_thread`` and ``max_thread`` are
    forwarded to ``WPimport.fromDirectory`` so that each worker handles its
    own slice of the files.

    Args:
        name_thread: zero-based index of the worker.
        max_thread: total number of workers.
        directory: comma-separated backup directories.
        logger: logger used for error reporting and forwarded to ``WPimport``.
        parser: parser name forwarded to ``WPimport``.
        wordpress: comma-separated WordPress sites.
        basic: authentication object forwarded to ``WPimport``.
        serial: when ``False`` every directory is imported into every
            WordPress site; otherwise the two lists are paired element by
            element and must have the same length.
        ssl_wordpress: forwarded to ``WPimport``.

    Raises:
        SystemExit: in serial mode when the two lists do not have the same
            length.
    """
    name = "Thread-{0}".format(int(name_thread) + 1)
    directory = directory.split(",")
    wordpress = wordpress.split(",")
    if serial is False:
        for site in wordpress:
            importWp = WPimport(name=name, basic=basic, wordpress=site, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress)
            for folder in directory:
                importWp.fromDirectory(folder, name_thread, max_thread)
            del importWp
    else:
        if len(directory) != len(wordpress):
            logger.error("{0} : Error : Number directory is differant than wordpress".format(name))
            exit(1)
        # zip() walks every pair; the former range(0, len - 1) skipped the
        # last pair and did nothing at all when a single pair was given.
        for folder, site in zip(directory, wordpress):
            importWp = WPimport(name=name, basic=basic, wordpress=site, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress)
            # Pass the worker coordinates explicitly: the defaults of
            # fromDirectory (number_thread=1, max_thread=1) make fromFile
            # start its index range past the end of the file list.
            importWp.fromDirectory(folder, name_thread, max_thread)
            del importWp
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--debug", help="Verbosity", action="store_true") parser.add_argument("--debug", help="Verbosity", action="store_true")
parser.add_argument("--logfile", help="Log file", default="") parser.add_argument("--logfile", help="Log file", default="")
parser.add_argument("--quiet", help="No console output", action="store_true") parser.add_argument("--quiet", help="No console output", action="store_true")
parser.add_argument("--parser", help="Parser content", default="html.parser") parser.add_argument("--parser", help="Parser content", default="html.parser")
parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1)
parser.add_argument("--no-ssl", help="No ssl for canalblog and/or wordpress (example wordpress,canalblog)", dest="ssl", default="")
subparsers = parser.add_subparsers(dest="command") subparsers = parser.add_subparsers(dest="command")
import_parser = subparsers.add_parser("import") import_parser = subparsers.add_parser("import")
import_parser.add_argument("--user", help="wordpress user", required=True) import_parser.add_argument("--user", help="wordpress user", required=True)
import_parser.add_argument("--password", help="password wordpress's user", default="")
import_parser.add_argument("--file", help="HTML file", default="") import_parser.add_argument("--file", help="HTML file", default="")
import_parser.add_argument("--directory", help="HTML directory", default="") import_parser.add_argument("--directory", help="HTML directory", default="")
import_parser.add_argument("--canalblog", help="URL Canalblog", default="") import_parser.add_argument("--canalblog", help="URL Canalblog", default="")
import_parser.add_argument("--wordpress", help="URL Wordpress", required=True) import_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
import_parser.add_argument("--serial", help="Serial execution", action="store_true") import_parser.add_argument("--serial", help="Serial execution", action="store_true")
import_parser.add_argument("--remove-all", dest="remove", help="Remove all", action="store_true")
import_parser.add_argument("--remove-posts", help="Remove all posts", dest="posts", action="store_true")
import_parser.add_argument("--remove-categories", help="Remove all categories", dest="categories", action="store_true")
import_parser.add_argument("--remove-tags", help="Remove all tags", dest="tags", action="store_true")
import_parser.add_argument("--remove-media", help="Remove all media", dest="media", action="store_true")
remove_parser = subparsers.add_parser("remove")
remove_parser.add_argument("--user", help="wordpress user", required=True)
remove_parser.add_argument("--password", help="password wordpress's user", default="")
remove_parser.add_argument("--wordpress", help="URL Wordpress", required=True)
remove_parser.add_argument("--all", dest="remove", help="Remove all (posts, media, tags, categories)", action="store_true")
remove_parser.add_argument("--posts", help="Remove all posts", action="store_true")
remove_parser.add_argument("--categories", help="Remove all categories", action="store_true")
remove_parser.add_argument("--tags", help="Remove all tags", action="store_true")
remove_parser.add_argument("--media", help="Remove all media", action="store_true")
export_parser = subparsers.add_parser("export") export_parser = subparsers.add_parser("export")
@@ -54,6 +174,14 @@ if __name__ == '__main__':
logger = logging.getLogger('import export canalblog') logger = logging.getLogger('import export canalblog')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ssl_canalblog = True
ssl_wordpress = True
for i in args.ssl.split(","):
if i == "canalblog":
ssl_canalblog = False
if i == "wordpress":
ssl_wordpress = False
if args.quiet is False: if args.quiet is False:
ch = logging.StreamHandler() ch = logging.StreamHandler()
@@ -76,80 +204,55 @@ if __name__ == '__main__':
fileHandler.setFormatter(formatter) fileHandler.setFormatter(formatter)
logger.addHandler(fileHandler) logger.addHandler(fileHandler)
if args.command == "import": if args.command == "import" or args.command == "remove":
password = getpass() password = args.password
if len(password) == 0: if len(args.password) == 0:
logger.error("No password error !!! ") password = getpass()
exit(1) if len(password) == 0:
logger.error("No password error !!! ")
exit(1)
basic = HTTPBasicAuth(args.user, password) basic = HTTPBasicAuth(args.user, password)
if args.command == "import":
wordpress = args.wordpress.split(",") wordpress = args.wordpress.split(",")
importWp = WPimport(basic, "", logger, args.parser) importWp = WPimport(basic=basic, wordpress="", logger=logger, parser=args.parser, ssl_wordpress=ssl_wordpress)
if len(args.file) > 0: if len(args.file) > 0:
for i in wordpress: for i in wordpress:
importWp.setUrl(i) importWp.setUrl(i)
importWp.fromFile(args.file.split(",")) importWp.fromFile(files=args.file.split(","))
exit(0)
if len(args.directory) > 0: if len(args.directory) > 0:
directory = args.directory.split(",") remove(args, basic, logger, ssl_wordpress)
if args.serial is False: try:
for i in wordpress: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
importWp.setUrl(i) wait_for = [
for j in directory: ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress)
importWp.fromDirectory(j) for i in range(0, int(args.parallel))
else: ]
if len(directory) != len(wordpress): except Exception as err:
logger.error("ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress") logger.error("Threading error : {0}".format(err))
exit(1)
for i in range(0, len(wordpress)-1):
importWp.setUrl(wordpress[i])
importWp.fromDirectory(directory[i])
exit(0)
if len(args.canalblog) > 0: if len(args.canalblog) > 0:
exportWp = WPExport("", logger, args.parser, args.directory) remove(args, basic, logger, ssl_wordpress)
canalblog = args.canalblog.split(",") try:
wordpress = args.wordpress.split(",") with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [
if args.serial is False: ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, ssl_canalblog)
for canal in canalblog: for i in range(0, int(args.parallel))
try: ]
o = urlparse(canal) except Exception as err:
o = o._replace(scheme="https") logger.error("Threading error : {0}".format(err))
url = o.geturl().replace(":///", "://") exit(0)
except Exception as err:
logger.error("parsing error : {0}".format(err))
exit(1)
exportWp.setUrl(url)
webpage = exportWp.getUrlPage()
for j in wordpress:
importWp.setUrl(j)
importWp.fromUrl(webpage)
else:
if len(canalblog) != len(wordpress):
logger.error("ERREUR : Le nombre de dossier n'est pas equivalent au nombre d'URL wordpress")
exit(1)
for i in range(0, len(canalblog)-1):
try:
o = urlparse(canalblog[i])
o = o._replace(scheme="https")
url = o.geturl().replace(":///", "://")
except Exception as err:
logger.error("parsing error : {0}".format(err))
exit(1)
exportWp.setUrl(url)
webpage = exportWp.getUrlPage()
importWp.setUrl(wordpress[i])
importWp.fromUrl(webpage)
if args.command == "export": if args.command == "export":
canalblog = args.url.split(",") canalblog = args.url.split(",")
exportWp = WPExport("", logger, args.parser, args.directory) protocol = "https"
if ssl_canalblog is False:
protocol = "http"
exportWp = WPExport(logger=logger, parser=args.parser, directory=args.directory, ssl_canalblog=ssl_canalblog)
for canal in canalblog: for canal in canalblog:
try: try:
o = urlparse(canal) o = urlparse(canal)
o = o._replace(scheme="https") o = o._replace(scheme=protocol)
url = o.geturl().replace(":///", "://") url = o.geturl().replace(":///", "://")
except Exception as err: except Exception as err:
logger.error("parsing error : {0}".format(err)) logger.error("parsing error : {0}".format(err))
@@ -160,12 +263,22 @@ if __name__ == '__main__':
if args.css is False: if args.css is False:
exportWp.downloadCss() exportWp.downloadCss()
del exportWp
if args.html is False or args.img is False: if args.html is False or args.img is False:
webpage = exportWp.getUrlPage() try:
if args.html is False: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
exportWp.downloadHTML(webpage) wait_for = [
ex.submit(download, i, int(args.parallel), url, logger, args.parser, args.directory, args.html, args.img, ssl_canalblog)
if args.img is False: for i in range(0, int(args.parallel))
exportWp.downloadImg(webpage) ]
except Exception as err:
logger.error("Threading error : {0}".format(err))
exit(0)
if args.command == "remove":
remove(args, basic, logger, ssl_wordpress)
exit(0) exit(0)

View File

@@ -6,22 +6,36 @@ from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry from requests.packages.urllib3.util.retry import Retry
class WPExport: class WPExport:
def __init__(self, url, logger, parser, directory): def __init__(self, name = "Thread-0", url = "", logger = None, parser = "html.parser", directory = "backup", ssl_canalblog=True):
self._url = url self._url = url
self._logger = logger self._logger = logger
self._parser = parser self._parser = parser
self._dir = directory self._dir = directory
self._name = name
self._protocol = "https"
if ssl_canalblog is False:
self._protocol = "http"
self._request = requests.Session() self._request = requests.Session()
retries = Retry(total=10, retries = Retry(total=10,
status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2) status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)
self._request.mount('http://', HTTPAdapter(max_retries=retries)) self._request.mount('{0}://'.format(self._protocol), HTTPAdapter(max_retries=retries))
# Destructor
def __del__(self):
self._logger.info("{0} : Export finished for {1}".format(self._name, self._url))
# Public method # Public method
# Set name
def setName(self, name):
self._name = "Thread-{0}".format(int(name) + 1)
# Set URL # Set URL
def setUrl(self, url): def setUrl(self, url):
@@ -55,11 +69,11 @@ class WPExport:
# Get URL # Get URL
def getUrlPage(self): def getUrlPage(self, index_thread, max_thread):
try: try:
page = self._request.get(self._url) page = self._request.get(self._url)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error : {1}".format(self._name, err))
exit(1) exit(1)
page_url = [] page_url = []
if page.status_code == 200: if page.status_code == 200:
@@ -70,60 +84,80 @@ class WPExport:
if href != "#": if href != "#":
page_url.append(href) page_url.append(href)
else: else:
self._logger.error("Url did not get due status code : {0}".format(page.status_code)) self._logger.error("{0} : URL did not get due status code : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
webpage = {"principal": {"page":[], "article":[]}, "publications": {"page":[], "article":[]}}
webpage = []
for i in page_url: for i in page_url:
section = "publications"
o = urlparse(i)
o = o._replace(scheme=self._protocol)
i = o.geturl().replace(":///", "://")
if i == "{0}/".format(self._url):
section = "principal"
try: try:
page = self._request.get(i) page = self._request.get(i)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
self._logger.info("page : {0}".format(i)) self._logger.info("{0} : page : {1}".format(self._name, i))
if i not in webpage: if i not in webpage[section]["page"]:
webpage.append(i) webpage[section]["page"].append(i)
soup = BeautifulSoup(page.text, self._parser) soup = BeautifulSoup(page.text, self._parser)
class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline") class_div = soup.find_all("div", class_="pagingfirstline")
if len(class_div) > 0: if len(class_div) > 0:
pagingfirstline = class_div[0].find_all("a") pagingfirstline = class_div[0].find_all("a")
if len(pagingfirstline) > 1: if len(pagingfirstline) > 1:
lastpage = pagingfirstline[len(pagingfirstline)-1].get("href", "/") lastpage = pagingfirstline[len(pagingfirstline)-1].get("href", "/")
self._logger.debug("{0} : Last page {1}".format(self._name, lastpage))
element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1] element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1]
number_page = element_lastpage.split("-")[0].split("p")[1] number_page = element_lastpage.split("-")[0].split("p")[1]
number_lastpage = int(number_page) / 10 number_lastpage = int(number_page) / 10
for j in range(1,int(number_lastpage)):
setPageDivided = int(number_lastpage) / max_thread
if setPageDivided > int(setPageDivided):
setPageDivided = setPageDivided + 1
setPagePart = setPageDivided * (index_thread + 1)
firstPagePart = (setPagePart - setPageDivided)
self._logger.debug("{0} : Total page : {1}".format(self._name,int(number_lastpage)))
self._logger.debug("{0} : First range : {1}".format(self._name, int(firstPagePart)))
self._logger.debug("{0} : Last range : {1}".format(self._name, int(setPagePart)))
for j in range(int(firstPagePart),int(setPagePart)+1):
paging = j * 10 paging = j * 10
categorie = urlparse(i).path.split("/") categorie = urlparse(i).path.split("/")
url_paging = "{0}/archives/p{1}-10.html".format(self._url, paging) url_paging = "{0}/archives/p{1}-10.html".format(self._url, paging)
if len(categorie) > 2: if len(categorie) > 2:
url_paging = "{0}/archives/{1}/p{2}-10.html".format(self._url, categorie[2], paging) url_paging = "{0}/archives/{1}/p{2}-10.html".format(self._url, categorie[2], paging)
self._logger.info(url_paging) self._logger.info("{0} : {1}".format(self._name, url_paging))
if url_paging not in webpage: if url_paging not in webpage[section]["page"]:
webpage.append(url_paging) webpage[section]["page"].append(url_paging)
page = self._request.get(url_paging) page = self._request.get(url_paging)
if page.status_code == 200: if page.status_code == 200:
soup = BeautifulSoup(page.text, self._parser) soup = BeautifulSoup(page.text, self._parser)
h2 = soup.find_all("h2") h2 = soup.find_all("h2")
self._logger.debug("{0} : {1} H2 : {2}".format(self._name, url_paging, h2))
for title in h2: for title in h2:
self._logger.debug("{0} : {1} a : {2}".format(self._name, url_paging, title.find_all("a")))
href = title.find_all("a")[0].get("href", "/") href = title.find_all("a")[0].get("href", "/")
if href not in webpage: if href not in webpage[section]["article"]:
try: try:
o = urlparse(href) o = urlparse(href)
o = o._replace(scheme="https").geturl() o = o._replace(scheme="https").geturl()
except Exception as err: except Exception as err:
self._logger.error("parsing error : {0}".format(err)) self._logger.error("parsing error : {0}".format(err))
exit(1) exit(1)
webpage.append(o) webpage[section]["article"].append(o)
else: else:
self._logger.error("web didn't get due status code : {0}".format(page.status_code)) self._logger.error("{0} : web didn't get due status code : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
return webpage return webpage
# Private method # Private method
# #
# Create path # Create path
@@ -135,7 +169,7 @@ class WPExport:
makedir.append(i) makedir.append(i)
repath = "/".join(makedir) repath = "/".join(makedir)
if not os.path.exists(repath): if not os.path.exists(repath):
self._logger.debug("Dossier crée : {0}".format(repath)) self._logger.debug("{0} : Dossier crée : {1}".format(self._name, repath))
try: try:
if len(repath) > 0: if len(repath) > 0:
os.mkdir(repath) os.mkdir(repath)
@@ -201,21 +235,21 @@ class WPExport:
try: try:
page = self._request.get(i) page = self._request.get(i)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
soup = BeautifulSoup(page.text, self._parser) soup = BeautifulSoup(page.text, self._parser)
img = soup.find_all("img") img = soup.find_all("img")
self._logger.info("image from page: {0} : ".format(i)) self._logger.info("{0} : image from page: {1} : ".format(self._name,i))
for anchor in img: for anchor in img:
src = anchor.get("src", "/") src = anchor.get("src", "/")
if src != "/": if src != "/":
if src not in page_img: if src not in page_img:
self._logger.info("image: {0} : ".format(src)) self._logger.info("{0} : image: {1} : ".format(self._name, src))
page_img.append(src) page_img.append(src)
else: else:
self._logger.error("Image did not get due status code : {0}".format(page.status_code)) self._logger.error("{0} : Image did not get due status code : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
return page_img return page_img
@@ -236,14 +270,16 @@ class WPExport:
self._mkdirPath("{0}/{1}/{2}".format(backup_dir, o.netloc, dir_page_web)) self._mkdirPath("{0}/{1}/{2}".format(backup_dir, o.netloc, dir_page_web))
try: try:
r = self._request.get(webpage[i]) r = self._request.get(webpage[i])
except Exception as err: except ConnectionError as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error : {1}".format(self._name, err))
exit(1) exit(1)
except Exception as err:
self._logger.error("{0} Exception error : {1}".format(self._name, err))
if r.status_code == 200: if r.status_code == 200:
fileDownload = "{0}/{1}/index.html".format(backup_dir, o.netloc) fileDownload = "{0}/{1}/index.html".format(backup_dir, o.netloc)
if len(dir_page_web) > 0 and len(filePageWeb) > 0: if len(dir_page_web) > 0 and len(filePageWeb) > 0:
fileDownload = "{0}/{1}{2}/{3}".format(backup_dir, o.netloc, dir_page_web, filePageWeb) fileDownload = "{0}/{1}{2}/{3}".format(backup_dir, o.netloc, dir_page_web, filePageWeb)
self._logger.info("{0}/{1} : {2}".format(i+1, len(webpage), fileDownload)) self._logger.info("{0} : {1}/{2} : {3}".format(self._name, i+1, len(webpage), fileDownload))
try: try:
open(fileDownload, "wb").write(r.content) open(fileDownload, "wb").write(r.content)
except Exception as err: except Exception as err:

View File

@@ -8,19 +8,27 @@ from requests.packages.urllib3.util.retry import Retry
class WPimport: class WPimport:
# Constructor # Constructor
def __init__(self, basic, wordpress, logger, parser): def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True):
self._name = name
self._basic = basic self._basic = basic
self._wordpress = wordpress self._wordpress = wordpress
self._logger = logger self._logger = logger
self._parser = parser self._parser = parser
self._headers_json = {'Content-Type': 'application/json', 'Accept':'application/json'} self._headers_json = {'Content-Type': 'application/json; charset=utf-8', 'Accept':'application/json'}
self._protocol = "https"
if ssl_wordpress is False:
self._protocol = "http"
self._request = requests.Session() self._request = requests.Session()
retries = Retry(total=10, retries = Retry(connect=10, read=10, redirect=5,
status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2) status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)
self._request.mount('http://', HTTPAdapter(max_retries=retries)) self._request.mount('{0}://'.format(self._protocol), HTTPAdapter(max_retries=retries))
# Destructor
def __del__(self):
print("{0} : Import finished for {1}".format(self._name, self._wordpress))
# Public method # Public method
@@ -32,10 +40,10 @@ class WPimport:
try: try:
r = self._request.get(webpage[i]) r = self._request.get(webpage[i])
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err))
exit(1) exit(1)
if r.status_code == 200: if r.status_code == 200:
self._logger.info("({0}/{1} : Page en cours d'import : {2}".format(i+1, len(webpage), webpage[i])) self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i]))
soup = BeautifulSoup(r.content, self._parser) soup = BeautifulSoup(r.content, self._parser)
articlebody = soup.find_all("div", class_="articlebody") articlebody = soup.find_all("div", class_="articlebody")
if len(articlebody) > 0: if len(articlebody) > 0:
@@ -43,32 +51,63 @@ class WPimport:
else: else:
self._addOrUpdateFeaturedMedia(soup) self._addOrUpdateFeaturedMedia(soup)
else: else:
self._logger.error("Connection error with status code : {0}".format(r.status_code)) self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code))
self._logger.debug(r.content) self._logger.debug("{0} : {1}".format(self._name, r.content))
def fromDirectory(self, directory): def fromDirectory(self, directory="", number_thread=1, max_thread=1):
directory = "{0}/archives".format(directory) directory = "{0}/archives".format(directory)
directories = self._getDirectories([], "{0}".format(directory)) directories = self._getDirectories([], "{0}".format(directory))
files = self._getFiles(directories) if len(directories) > 0:
self.fromFile(files) files = self._getFiles(directories)
self.fromFile(files, number_thread, max_thread)
else:
self._logger.error("{0} : No files for {1}".format(self._name, directory))
def fromFile(self, files): def fromFile(self, files=[], number_thread=1, max_thread=1):
for file in files: divFiles = int(len(files) / max_thread)
if os.path.exists(file): currentRangeFiles = int(divFiles * (number_thread+1))
self._logger.info("Fichier en cours de traitement : {0}".format(file)) firstRange = int(currentRangeFiles - divFiles)
with open(file, 'r') as f: self._logger.debug("{0} : index : {1}".format(self._name,number_thread))
self._logger.debug("{0} : first range : {1}".format(self._name,firstRange))
self._logger.debug("{0} : last range : {1}".format(self._name,currentRangeFiles))
for i in range(firstRange, currentRangeFiles):
if os.path.exists(files[i]):
self._logger.info("{0} : ({1}/{2}) File is being processed : {3}".format(self._name, i+1, currentRangeFiles + 1, files[i]))
with open(files[i], 'r') as f:
content = f.read() content = f.read()
self._logger.debug("{0} : Size of article : {1}".format(self._name, len(content)))
soup = BeautifulSoup(content, self._parser) soup = BeautifulSoup(content, self._parser)
articlebody = soup.find_all("div", class_="articlebody") articlebody = soup.find_all("div", class_="articlebody")
self._logger.debug("{0} : Number of article : {1}".format(self._name, len(articlebody)))
if len(articlebody) > 0: if len(articlebody) > 0:
self._addOrUpdatePost(soup) self._addOrUpdatePost(soup)
else: else:
self._addOrUpdateFeaturedMedia(soup) self._addOrUpdateFeaturedMedia(soup)
# Private method # Private method
## replace caracter
def _replaceCaracter(self, title_rendered):
    """Return *title_rendered* with typographic characters replaced.

    Each key of the table below is replaced by its value, in the order the
    keys are written.
    """
    # NOTE(review): several keys look identical in this capture; presumably
    # they differ by invisible characters (e.g. non-breaking spaces) —
    # confirm against the repository before touching the table.
    list_replace = {'’': "'", '–': '-', '…': '...', '« ': '"', ' »': '"', '« ': '"', ' »': '"', '’': "'", '"‘': "'"}
    for old, new in list_replace.items():
        title_rendered = title_rendered.replace(old, new)
    return title_rendered
## remove space
def _removeSpace(self, title):
if title[len(title)-1] == " ":
title = title[:-1]
if title[0] == " ":
title = title[1:]
return title
## Get all files ## Get all files
def _getFiles(self, item): def _getFiles(self, item):
@@ -97,9 +136,9 @@ class WPimport:
h2 = i.find_all("h2")[0].text h2 = i.find_all("h2")[0].text
params = {"search":h2, "type":"post"} params = {"search":h2, "type":"post"}
try: try:
page = self._request.get("http://{0}/wp-json/wp/v2/search".format(self._wordpress), auth=self._basic, params=params) page = self._request.get("{1}://{0}/wp-json/wp/v2/search".format(self._wordpress, self._protocol), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
result = page.json() result = page.json()
@@ -111,16 +150,16 @@ class WPimport:
try: try:
page = self._request.get(img_src) page = self._request.get(img_src)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for get featured media : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
name_img = img_src.replace("_q", "") name_img = img_src.replace("_q", "")
name_img = name_img.split("/")[len(name_img.split("/"))-1] name_img = name_img.split("/")[len(name_img.split("/"))-1]
params = {"search": name_img} params = {"search": name_img}
try: try:
page = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params) page = self._request.get("{1}://{0}/wp-json/wp/v2/media".format(self._wordpress, self._protocol), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error search featured media : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
res = page.json() res = page.json()
@@ -128,29 +167,29 @@ class WPimport:
id_media = res[0]["id"] id_media = res[0]["id"]
data = {"featured_media": id_media} data = {"featured_media": id_media}
try: try:
r = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, result[0]["id"]), auth=self._basic, headers=self._headers_json, data=json.dumps(data)) r = self._request.post("{2}://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, result[0]["id"], self._protocol), auth=self._basic, headers=self._headers_json, data=json.dumps(data))
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for post media featured : {1}".format(self._name, err))
exit(1) exit(1)
if r.status_code == 200: if r.status_code == 200:
self._logger.info("Ajout media featured : {0}".format(r.json()["title"]["raw"])) self._logger.info("{0} : Add media featured : {1}".format(self._name, r.json()["title"]["raw"]))
else: else:
self._logger.error("Connection error with status code : {0}".format(r.status_code)) self._logger.error("{0} : Connection error with status code for featured media : {1}".format(self._name, r.status_code))
self._logger.debug(r.content) self._logger.debug("{0} : {1}".format(self._name, r.content))
else: else:
self._logger.info("Aucun media trouvé pour {0}".format(h2)) self._logger.info("{0} : No media found for {1}".format(self._name, h2))
else: else:
self._logger.error("Connection error with status code : {0}".format(page.status_code)) self._logger.error("{0} : Connection error with status code for search featured media: {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
else: else:
self._logger.error("Connection error with status code : {0}".format(page.status_code)) self._logger.error("{0} : Connection error for get featured media with status code : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
else: else:
self._logger.error("Connection error with status code : {0}".format(page.status_code)) self._logger.error("{0} : Connection error with status code for featured media : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
## Association image to post ## Association image to post
@@ -159,66 +198,76 @@ class WPimport:
for i in list_img: for i in list_img:
data = {"post": post_id} data = {"post": post_id}
try: try:
r = self._request.post("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, i["id"]), auth=self._basic, data=data) r = self._request.post("{2}://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, i["id"], self._protocol), auth=self._basic, data=data)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for link image to post : {1}".format(self._name, err))
exit(1) exit(1)
if r.status_code == 200: if r.status_code == 200:
self._logger.info("Association d'une image à l'article {0}".format(title)) self._logger.info("{0} : Link image to post {1}".format(self._name, title))
else: else:
self._logger.error("Connection error with status code : {0}".format(r.status_code)) self._logger.error("{0} Connection error with status code for link image to post : {1}".format(self._name, r.status_code))
self._logger.debug(r.content) self._logger.debug("{0} : {1}".format(self._name, r.content))
## Add or update img ## Add or update img
def _addOrUpdateMedia(self, href_img, page): def _addOrUpdateMedia(self, href_img, page):
media_authorized = ["png", "jpg", "jpeg", "svg", "gif"]
media = {"id":"", "rendered":""} media = {"id":"", "rendered":""}
split_fileimg = href_img.split("/") split_fileimg = href_img.split("/")
img_name = split_fileimg[len(split_fileimg)-1] img_name = split_fileimg[len(split_fileimg)-1]
params = { "search": img_name} img_type_file = img_name.split(".")[len(img_name.split("."))-1]
try: is_img = True
r = self._request.get("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, params=params) if img_type_file not in media_authorized:
except Exception as err: self._logger.error("{0} : Element {1} is not image".format(self._name,img_name))
self._logger.error("Connection error : {0}".format(err)) is_img = False
exit(1) if is_img is True:
if r.status_code == 200: self._logger.debug("{0} : Search for image {1} with URL {2}".format(self._name, img_name, "{1}://{0}/wp-json/wp/v2/media".format(self._wordpress, self._protocol)))
res = r.json() params = { "search": img_name}
if len(res) > 0:
params = {"force":1}
try:
r = self._request.delete("http://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, res[0]["id"]), auth=self._basic, params=params)
except Exception as err:
self._logger.error("Connection error : {0}".format(err))
exit(1)
if r.status_code == 200:
self._logger.info("Image removed {0}".format(img_name))
else:
self._logger.error("Image not removed due status code : {0}".format(r.status_code))
self._logger.debug(r.content)
data = page.content
img_type = "image/png"
if img_name.split(".")[1] == "jpg" or img_name.split(".")[1] == "jpeg":
img_type = "image/jpg"
headers={ 'Content-Type': img_type,'Content-Disposition' : 'attachment; filename={0}'.format(img_name)}
try: try:
r = self._request.post("http://{0}/wp-json/wp/v2/media".format(self._wordpress), auth=self._basic, headers=headers, data=data) r = self._request.get("{1}://{0}/wp-json/wp/v2/media".format(self._wordpress, self._protocol), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for search media : {1}".format(self._name, err))
exit(1) exit(1)
if r.status_code == 201: self._logger.debug("{0} : Search for image {1} and his status code {2}".format(self._name, img_name, r.status_code))
self._logger.info("Image added {0}".format(img_name)) if r.status_code == 200:
res = r.json() res = r.json()
media["id"] = res["id"] self._logger.debug("{0} : Number of image in search : {1}".format(self._name, len(res)))
media["rendered"] = res["guid"]["rendered"] if len(res) > 0:
else: params = {"force":1}
self._logger.error("Image not added due status code : {0}".format(r.status_code)) try:
self._logger.debug(r.content) r = self._request.delete("{2}://{0}/wp-json/wp/v2/media/{1}".format(self._wordpress, res[0]["id"], self._protocol), auth=self._basic, params=params)
except Exception as err:
self._logger.error("{0} Connection error for delete image : {1}".format(self._name, err))
exit(1)
if r.status_code == 200:
self._logger.info("{0} : Image removed {1}".format(self._name, img_name))
else:
self._logger.error("{0} : Image {1} not removed due status code : {2}".format(self._name, img_name, r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))
else: data = page.content
self._logger.error("Connection error with status code : {0}".format(r.status_code)) img_type = "image/{0}".format(img_type_file)
self._logger.debug(r.content) if img_type_file == "jpg":
img_type = "image/jpeg"
headers={ 'Content-Type': img_type,'Content-Disposition' : 'attachment; filename={0}'.format(img_name)}
try:
r = self._request.post("{1}://{0}/wp-json/wp/v2/media".format(self._wordpress, self._protocol), auth=self._basic, headers=headers, data=data)
except Exception as err:
self._logger.error("{0} : Connection error for add image : {1}".format(self._name, err))
exit(1)
if r.status_code == 201:
self._logger.info("{0} : Image added {1}".format(self._name, img_name))
res = r.json()
media["id"] = res["id"]
media["rendered"] = res["guid"]["rendered"]
else:
self._logger.error("{0} : Image {1}.{2} not added due status code : {3}".format(self._name, img_name, img_type, r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))
else:
self._logger.error("{0} : Connection error for search image with status code : {1}".format(self._name, r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content))
return media return media
@@ -229,29 +278,29 @@ class WPimport:
try: try:
params = {"post": post, "author_name":i["author"], "date":i["date"]} params = {"post": post, "author_name":i["author"], "date":i["date"]}
page = self._request.get("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, params=params) page = self._request.get("{1}://{0}/wp-json/wp/v2/comments".format(self._wordpress, self._protocol), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for search comment : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
result = page.json() result = page.json()
for j in result: for j in result:
try: try:
params = {"force":1} params = {"force":1}
page = self._request.delete("http://{0}/wp-json/wp/v2/comments/{1}".format(self._wordpress, j["id"]), params=params, auth=self._basic) page = self._request.delete("{2}://{0}/wp-json/wp/v2/comments/{1}".format(self._wordpress, j["id"], self._protocol), params=params, auth=self._basic)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for delete comment : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
self._logger.info("Comment deleted for {0}".format(title)) self._logger.info("{0} : Comment deleted for {1}".format(self._name, title))
self._logger.debug("Comment deleted : {0}".format(j)) self._logger.debug("{0} : Comment deleted : {1}".format(self._name, j))
else: else:
self._logger.error("Comment not deleted for {0} due status code : {1}".format(title, page.status_code)) self._logger.error("{0} : Comment not deleted for {1} due status code : {2}".format(self._name, title, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
else: else:
self._logger.error("Comment not listed for {0} due status code : {1}".format(title, page.status_code)) self._logger.error("{0} : Comment not listed for {1} due status code : {2}".format(self._name, title, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
for i in comment: for i in comment:
data = {"post": post, "content": i["content"], "date": i["date"], "author_name": i["author"], "status": "approved"} data = {"post": post, "content": i["content"], "date": i["date"], "author_name": i["author"], "status": "approved"}
@@ -260,29 +309,29 @@ class WPimport:
parent_id = int(i["parent_id"]) parent_id = int(i["parent_id"])
params = {"post": post, "author_name":comment[parent_id]["author"], "date":comment[parent_id]["date"]} params = {"post": post, "author_name":comment[parent_id]["author"], "date":comment[parent_id]["date"]}
try: try:
page = self._request.get("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, params=params) page = self._request.get("{1}://{0}/wp-json/wp/v2/comments".format(self._wordpress, self._protocol), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for parent comment : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
result = page.json() result = page.json()
if len(result) > 0: if len(result) > 0:
data["parent"]=result[0]["id"] data["parent"]=result[0]["id"]
else: else:
self._logger.error("Connection error for parent comment with status code : {0}".format(page.status_code)) self._logger.error("{0} : Connection error for parent comment with status code : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
try: try:
page = self._request.post("http://{0}/wp-json/wp/v2/comments".format(self._wordpress), auth=self._basic, data=data) page = self._request.post("{1}://{0}/wp-json/wp/v2/comments".format(self._wordpress, self._protocol), auth=self._basic, data=data)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for add comment : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 201: if page.status_code == 201:
self._logger.info("Comment added for {0}".format(title)) self._logger.info("{0} : Comment added for {1}".format(self._name, title))
self._logger.debug("Data : {0}".format(data)) self._logger.debug("{0} : Data : {1}".format(self._name, data))
else: else:
self._logger.error("Comment not added for {0} due status code : {1}".format(title, page.status_code)) self._logger.error("{0} : Comment not added for {1} due status code : {2}".format(self._name, title, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
## Check class name ## Check class name
@@ -346,6 +395,7 @@ class WPimport:
listelement[i] = [] listelement[i] = []
articletitle = soup.find_all("h2", class_="articletitle") articletitle = soup.find_all("h2", class_="articletitle")
self._logger.debug("{0} : Title of the article : {1}".format(self._name, articletitle))
articlebody = soup.find_all("div", class_="articlebody") articlebody = soup.find_all("div", class_="articlebody")
articledate = soup.find_all("span", class_="articledate") articledate = soup.find_all("span", class_="articledate")
articleacreator = soup.find_all("span", class_="articlecreator") articleacreator = soup.find_all("span", class_="articlecreator")
@@ -353,10 +403,12 @@ class WPimport:
itemfooter = soup.find_all("div", class_="itemfooter") itemfooter = soup.find_all("div", class_="itemfooter")
comment = soup.find_all("li", class_="comment") comment = soup.find_all("li", class_="comment")
img_a = articlebody[0].find_all("a", {"target": "_blank"}) img_a = articlebody[0].find_all("a", {"target": "_blank"})
self._logger.debug("{0} : Number of image's link : {1}".format(self._name, len(img_a)))
list_img = [] list_img = []
for i in img_a: for i in img_a:
new_img = {} new_img = {}
img = i.find_all("img") img = i.find_all("img")
self._logger.debug("{0} : Number of image's tag : {1}".format(self._name, len(img)))
if len(img) > 0: if len(img) > 0:
href_a = i.get("href") href_a = i.get("href")
href_img = img[0].get("src") href_img = img[0].get("src")
@@ -365,16 +417,16 @@ class WPimport:
try: try:
page_img = self._request.get(href_img) page_img = self._request.get(href_img)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err))
exit(1) exit(1)
if page_img.status_code == 404: if page_img.status_code == 404:
href_img = href_a href_img = href_a
try: try:
page_img = self._request.get(href_a) page_img = self._request.get(href_a)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err))
exit(1) exit(1)
self._logger.debug("{0} : Status code for image {1} : {2}".format(self._name, href_img, page_img.status_code))
if page_img.status_code == 200: if page_img.status_code == 200:
media=self._addOrUpdateMedia(href_img, page_img) media=self._addOrUpdateMedia(href_img, page_img)
new_img["id"]=media["id"] new_img["id"]=media["id"]
@@ -386,10 +438,10 @@ class WPimport:
new_img["new_src"]=media["rendered"] new_img["new_src"]=media["rendered"]
list_img.append(new_img) list_img.append(new_img)
if page_img.status_code not in [200, 404]: if page_img.status_code not in [200, 404]:
self._logger.error("Connection error with status code : {0}".format(page_img.status_code)) self._logger.error("{0} : Connection error with status code for get image : {1}".format(self._name, page_img.status_code))
self._logger.debug(page_img.content) self._logger.debug("{0} : {1}".format(self._name, page_img.content))
self._logger.debug("{0} : Number of image : {1}".format(self._name, len(list_img)))
comment_post = self._getComment(comment) comment_post = self._getComment(comment)
a = itemfooter[0].find_all("a", {"rel": True}) a = itemfooter[0].find_all("a", {"rel": True})
@@ -404,38 +456,61 @@ class WPimport:
for i in liste: for i in liste:
for j in element[i]: for j in element[i]:
element_exist = False element_exist = False
try: title_element = self._removeSpace(j)
params = {"params":j} for index in range(1,10):
page = self._request.get("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, params=params) self._logger.info("{0} : search {1} with index {2} : {3}".format(self._name, i, index, title_element))
except Exception as err:
self._logger.error("Connection error : {0}".format(err))
exit(1)
if page.status_code == 200:
element_exist = True
result = page.json()
listelement[i].append(result[0]["id"])
else:
self._logger.error("{0} not found due status code : {1}".format(i, page.status_code))
self._logger.debug(page.content)
if element_exist is False:
data = {"name": j}
self._logger.debug("URL : {0} ".format("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i)))
self._logger.debug("data : {0}".format(data))
self._logger.debug("headers : {0}".format(self._headers_form))
try: try:
page = self._request.post("http://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i), auth=self._basic, headers=self._headers_json, data=data) params = {"search":title_element, "per_page":"100", "page":index}
except Exception as err: page = self._request.get("{2}://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i, self._protocol), auth=self._basic, params=params)
self._logger.error("Connection error : {0}".format(err)) except ConnectionError as err:
self._logger.error("{0} : Connection error for {1} : {2}".format(self._name, i, err))
exit(1) exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for {1} : {2}".format(self._name, i, err))
if page.status_code == 200:
result = page.json()
self._logger.debug("{0} : content {3} {2} : {1}".format(self._name, result, title_element, i))
if len(result) > 0:
for k in result:
title_rendered = k["name"]
self._logger.debug("{0} : content {2} : {1}".format(self._name, title_rendered, i))
self._logger.debug("{0} : size of content {3} : {2} - {1}".format(self._name, len(title_rendered), len(title_element), i))
if len(title_element) != len(title_rendered):
title_rendered = self._replaceCaracter(title_rendered)
if title_element == title_rendered:
self._logger.info("{0} : {1} found : {2}".format(self._name, i, title_rendered))
element_exist = True
listelement[i].append(k["id"])
else:
break
if page.status_code == 400:
self._logger.error("{0} : {1} not found due status code : {2}".format(self._name, i, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
break
else:
self._logger.error("{0} : {1} not found due status code : {2}".format(self._name, i, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
self._logger.debug("{0} : Element {3} {2} is {1}".format(self._name, element_exist, title_element, i))
if element_exist == False:
data = {"name": title_element}
self._logger.info("{0} : Create {1} : {2}".format(self._name, i, title_element))
self._logger.debug("{0} : Data : {1}".format(self._name, data))
try:
page = self._request.post("{2}://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i, self._protocol), auth=self._basic, headers=self._headers_json, data=json.dumps(data))
except ConnectionError as err:
self._logger.error("{0} : Connection error for post {1} : {2}".format(self._name, i, err))
exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for post {1} : {2}".format(self._name, i, err))
if page.status_code == 201: if page.status_code == 201:
self._logger.info("{0} : {1} created : {2}".format(self._name, i, j))
result = page.json() result = page.json()
listelement[i].append(result["id"]) listelement[i].append(result["id"])
else: else:
self._logger.error("{0} not added due status code : {1}".format(i, page.status_code)) self._logger.error("{0} : {1} not added due status code : {2}".format(self._name, i, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(self._name, page.content))
title = articletitle[0].text title = articletitle[0].text
author = articleacreator[0].text.lower() author = articleacreator[0].text.lower()
@@ -453,66 +528,122 @@ class WPimport:
bodyhtml = bodyhtml.replace(i["old_src"], o.path) bodyhtml = bodyhtml.replace(i["old_src"], o.path)
hour = articledate[0].text hour = articledate[0].text
time = dateheader[0].text.split(" ") time = dateheader[0].text.split(" ")
self._logger.debug("{0} : Title post : |{1}|".format(self._name, title))
title = self._removeSpace(title)
self._logger.debug("{0} : Rendered Title post : |{1}|".format(self._name, title))
data = {"title":title, "content":bodyhtml, "status":"publish", "date": "{0}-{1}-{2}T{3}:00".format(time[2],month[time[1]],time[0], hour), "tags": listelement["tags"], "categories": listelement["categories"]} data = {"title":title, "content":bodyhtml, "status":"publish", "date": "{0}-{1}-{2}T{3}:00".format(time[2],month[time[1]],time[0], hour), "tags": listelement["tags"], "categories": listelement["categories"]}
params = {"search":author} self._logger.debug("{0} : Data for post : |{1}| : {2}" .format(self._name, title, data))
params = {"search":author, "per_page":100}
try: try:
page = self._request.get("http://{0}/wp-json/wp/v2/users".format(self._wordpress), auth=self._basic, params=params) self._logger.info("{0} : Search author : {1}".format(self._name, author))
except Exception as err: page = self._request.get("{1}://{0}/wp-json/wp/v2/users".format(self._wordpress, self._protocol), auth=self._basic, headers=self._headers_json, params=params)
self._logger.error("Connection error : {0}".format(err)) self._logger.debug("{0} : End Search author : {1}".format(self._name, author))
self._logger.debug("{0} : Debug requests : {1}".format(self._name, page.content))
except ConnectionError as err:
self._logger.error("{0} : Connection error for get author : {1}".format(self._name, err))
exit(1) exit(1)
if page.status_code == 200:
result = page.json()
data["author"] = result[0]["id"]
else:
self._logger.error("Connection error with status code : {0}".format(page.status_code))
self._logger.debug(page.content)
params = {"search":title}
try:
page = self._request.get("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, params=params)
except Exception as err: except Exception as err:
self._logger.error("Connection error : {0}".format(err)) self._logger.error("{0} : Exception error for get author : {1}".format(self._name, err))
exit(1)
page_exist = True
headers = {'Content-Type': 'application/json', 'Accept':'application/json'}
if page.status_code == 200: if page.status_code == 200:
self._logger.info("{0} : Get author id : {1}".format(self._name, result))
result = page.json() result = page.json()
if len(result) == 0: for a in result:
page_exist = False data["author"] = a["id"]
else:
self._logger.info("La page {0} existe deja et mis à jour".format(title))
post_id = result[0]["id"]
try:
page = self._request.post("http://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id), auth=self._basic, headers=headers, data=json.dumps(data))
except Exception as err:
self._logger.error("Connection error : {0}".format(err))
exit(1)
if page.status_code == 200:
result = page.json()
self._logger.info("Article mis à jour : {0}".format(result["title"]["raw"]))
self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"])
self._linkImgPost(result["title"]["raw"], list_img, result["id"])
else:
self._logger.error("Post not updated due status code : {0}".format(page.status_code))
self._logger.debug(page.content)
else: else:
self._logger.error("Connection for update post error with status code : {0}".format(page.status_code)) self._logger.error("{0} : Connection error with status code for get author : {1}".format(self._name, page.status_code))
self._logger.debug(page.content) self._logger.debug("{0} : {1}".format(page.content))
page_is_exist = False
for index in range(1,10):
if page_exist == False: params = {"search": title, "per_page":100, "page": index}
try: try:
page = self._request.post("http://{0}/wp-json/wp/v2/posts".format(self._wordpress), auth=self._basic, headers=headers, data=json.dumps(data)) self._logger.info("{0} : Search post with index {2} : {1}".format(self._name, title, index))
except Exception as err: page = self._request.get("{1}://{0}/wp-json/wp/v2/posts".format(self._wordpress, self._protocol), auth=self._basic, params=params, headers=self._headers_json)
self._logger.error("Connection error : {0}".format(err)) except ConnectionError as err:
self._logger.error("{0} : Connection error for search post : {1}".format(self._name, err))
exit(1) exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for search post : {1}".format(self._name, err))
if page.status_code == 200:
self._logger.debug("{0} : Encoding : {1}".format(self._name, page.encoding))
page.encoding = "utf-8"
result = page.json()
if len(result) == 0:
break
self._logger.info("{0} : Number result posts : {1}".format(self._name, len(result)))
count = 0
for i in result:
title_rendered = i["title"]["rendered"]
self._logger.info("{0} : Search title posts for |{2}| : |{1}|".format(self._name, title_rendered, title))
if len(title_rendered) != len(title):
title_rendered = self._replaceCaracter(title_rendered)
self._logger.debug("{0} : Search title posts for |{2}| : |{1}|".format(self._name, title_rendered, title))
self._logger.debug("{0} : SIze of title : {1} - {2}".format(self._name, len(title), len(title_rendered)))
if title_rendered == title:
page_is_exist = True
post_id = i["id"]
count = count + 1
if count > 1:
self._logger.info("{0} : Page {1} is double and going to delete".format(self._name, title))
try:
params = {"force":1}
page = self._request.delete("{2}://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id, self._protocol), auth=self._basic, headers=self._headers_json, params=params)
except ConnectionError as err:
self._logger.error("{0} : Connection error for deleted post : {1}".format(self._name, err))
exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for deleted post : {1}".format(self._name, err))
if page.status_code == 200:
self._logger.info("{0} : Post deleted : {1}".format(self._name, title))
else:
self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
else:
self._logger.debug("{0} : Data for post to update : {1}".format(self._name, i))
self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, title))
try:
page = self._request.post("{2}://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id, self._protocol), auth=self._basic, headers=self._headers_json, data=json.dumps(data))
except ConnectionError as err:
self._logger.error("{0} : Connection error for update post : {1}".format(self._name, err))
exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for update post : {1}".format(self._name, err))
if page.status_code == 200:
result = page.json()
self._logger.info("{0} : Post updated : {1}".format(self._name, title))
self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"])
self._linkImgPost(result["title"]["raw"], list_img, result["id"])
else:
self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
if page.status_code == 400:
self._logger.error("{0} : Connection for update post unauthorized : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
break
else:
self._logger.error("{0} : Connection for update post error with status code : {1}".format(self._name, page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content))
if page_is_exist is False:
try:
self._logger.info("{0} : Creating posts : {1}".format(self._name, data["title"]))
page = self._request.post("{1}://{0}/wp-json/wp/v2/posts".format(self._wordpress, self._protocol), auth=self._basic, headers=self._headers_json, data=json.dumps(data))
except ConnectionError as err:
self._logger.error("{0} : Connection error for create post : {1}".format(self._name, err))
exit(1)
except Exception as err:
self._logger.error("{0} : Exception error for create post : {1}".format(self._name, err))
if page.status_code == 201: if page.status_code == 201:
result = page.json() result = page.json()
self._logger.info("Article ajoute : {0}".format(result["title"]["raw"])) self._logger.info("{0} : Post added : {1}".format(self._name, result["title"]["raw"]))
self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"]) self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"])
self._linkImgPost(result["title"]["raw"], list_img, result["id"]) self._linkImgPost(result["title"]["raw"], list_img, result["id"])
else: else:
self._logger.error("Post not added due status code : {0}".format(r.status_code)) self._logger.error("{0} : Post not added due status code : {1}".format(self._name, r.status_code))
self._logger.debug(r.content) self._logger.debug("{0} : {1}".format(self._name, r.content))

78
lib/WPRemove.py Normal file
View File

@@ -0,0 +1,78 @@
#!/usr/bin/python3
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests, os, logging, re, json
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
class WPRemove:
    """Delete posts, tags, categories and media of a WordPress site.

    All operations go through the ``/wp-json/wp/v2/<resource>`` REST
    endpoints of the configured host, using the ``auth`` object and JSON
    headers stored on the instance.
    """

    # Constructor
    def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, ssl_wordpress=True):
        """Store the connection settings and prepare a retrying HTTP session.

        Args:
            name: Label prefixed to every log message.
            basic: Value passed as ``auth=`` to each request.
            wordpress: Host (and optional path) of the WordPress site,
                without the protocol prefix.
            logger: Logger used for all messages. When ``None``, a module
                logger is used so that logging calls never fail.
            ssl_wordpress: ``https`` is used unless this is exactly ``False``.
        """
        self._name = name
        self._basic = basic
        self._wordpress = wordpress
        # Every method logs unconditionally, so a usable logger must exist.
        self._logger = logger if logger is not None else logging.getLogger(__name__)
        self._headers_json = {'Content-Type': 'application/json', 'Accept':'application/json'}
        self._protocol = "https"
        if ssl_wordpress is False:
            self._protocol = "http"
        self._request = requests.Session()
        retries = Retry(connect=10, read=10, redirect=5,
                        status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)
        # The retry policy is mounted only for the protocol actually in use.
        self._request.mount('{0}://'.format(self._protocol), HTTPAdapter(max_retries=retries))

    # Destructor
    def __del__(self):
        # NOTE(review): the message says "Import" although this class removes
        # content; text kept unchanged because it is runtime output.
        print("{0} : Import finished for {1}".format(self._name, self._wordpress))

    # Public method
    def setUrl(self, wordpress):
        """Change the WordPress host targeted by subsequent calls."""
        self._wordpress = wordpress

    def cleanPosts(self):
        """Remove every item returned by the ``posts`` endpoint."""
        self._removeAll("posts")

    def cleanTags(self):
        """Remove every item returned by the ``tags`` endpoint."""
        self._removeAll("tags")

    def cleanCategories(self):
        """Remove every item returned by the ``categories`` endpoint."""
        self._removeAll("categories")

    def cleanMedia(self):
        """Remove every item returned by the ``media`` endpoint."""
        self._removeAll("media")

    # Private method
    @staticmethod
    def _itemLabel(item):
        """Return a printable label for a listed item.

        Uses ``item["title"]["rendered"]`` when present, otherwise
        ``item["name"]``, otherwise ``item["id"]`` (``None`` if all are
        missing), so that logging never raises ``KeyError``.
        """
        title = item.get("title")
        if isinstance(title, dict) and "rendered" in title:
            return title["rendered"]
        return item.get("name", item.get("id"))

    def _removeAll(self, composant):
        """List ``composant`` items 100 at a time and delete each with ``force=1``.

        The listing is repeated until it comes back empty. The method stops
        early (after logging an error) when the listing fails, when it does
        not answer 200, or when a whole page was processed without a single
        successful deletion, which would otherwise repeat forever.

        Args:
            composant: REST resource name, e.g. ``"posts"`` or ``"tags"``.

        Raises:
            SystemExit: with code 1 when a delete request raises an exception.
        """
        list_url = "{2}://{0}/wp-json/wp/v2/{1}".format(self._wordpress, composant, self._protocol)
        while True:
            try:
                self._logger.info("{0} : List {2} to remove for url : {1}".format(self._name, self._wordpress, composant))
                r = self._request.get(list_url, auth=self._basic, params={"per_page":100}, headers=self._headers_json)
            except Exception as err:
                self._logger.error("{0} : Connection error for list {1} to remove : {2}".format(self._name, composant, err))
                # No response object exists here, so there is nothing to inspect.
                return
            if r.status_code != 200:
                self._logger.error("{0} : Error for list to remove {1} due status code {2}".format(self._name, composant, r.status_code))
                self._logger.debug("{0} : Content error for {1} : {2}".format(self._name, composant, r.content))
                return
            result = r.json()
            if not result:
                return
            removed = 0
            for i in result:
                label = self._itemLabel(i)
                self._logger.info("{0} : Remove {2} for url {1} : {3}".format(self._name, self._wordpress, composant, label))
                try:
                    deleted = self._request.delete("{3}://{0}/wp-json/wp/v2/{1}/{2}".format(self._wordpress, composant, i["id"], self._protocol), auth=self._basic, headers=self._headers_json, params={"force":1})
                except Exception as err:
                    self._logger.error("{0} : Connection error for {1} remove : {2}".format(self._name, composant, err))
                    raise SystemExit(1)
                if deleted.status_code == 200:
                    removed += 1
                    self._logger.info("{0} : Post removed for URL {1} {2} : {3}".format(self._name, self._wordpress, composant, label))
                else:
                    self._logger.error("{0} : Connection error for post {1} {2} {3} with status code {4}".format(self._name, self._wordpress, composant, label, deleted.status_code))
            if removed == 0:
                # Listing again would return the same items; stop instead of looping.
                self._logger.error("{0} : No {1} could be removed for url {2}, stopping".format(self._name, composant, self._wordpress))
                return