24 Commits

Author SHA1 Message Date
b88917127d Merge pull request 'webpage-file' (#20) from webpage-file into master (Reviewed-on: #20) 2023-06-26 22:28:26 +00:00
781d8959c4 fix tmp directory parameter 2023-06-27 00:25:23 +02:00
a67ff868f3 fix json read file 2023-06-26 23:52:03 +02:00
8e0abc40bd check files tmp 2023-06-26 23:09:54 +02:00
9149a6c5cb rollback webpage 2023-06-26 22:44:42 +02:00
d1b6e8048a add revert files json 2023-06-25 21:16:05 +02:00
0eab1d885b add open file tmp 2023-06-20 21:38:39 +02:00
35ff22d463 change parameter for webpage 2023-06-20 00:17:38 +02:00
7dace5bdb7 add file tmp 2023-06-19 23:58:59 +02:00
703cc8922a Merge pull request 'diff-img' (#19) from diff-img into master (Reviewed-on: #19) 2023-06-16 22:08:50 +00:00
ff3ee301fb diff img path done 2023-06-15 00:10:44 +02:00
04da5bc5f6 diff path network 2023-06-13 22:00:51 +02:00
f01a69a1e7 Merge pull request 'wpchange' (#18) from wpchange into master (Reviewed-on: #18) 2023-06-12 22:48:57 +00:00
da4db0277a add img a change 2023-06-13 00:46:18 +02:00
7228911e68 add js and css 2023-06-13 00:38:34 +02:00
9e7e1b27fd change WIP test 2023-06-11 20:24:22 +02:00
16368c13bb add WPChange 2023-06-10 01:58:08 +02:00
c631909cb6 WPchange wip 2023-06-06 00:22:16 +02:00
3e76892676 add wpchange 2023-06-05 23:46:57 +02:00
3e75f05340 Merge pull request 'add-parameter' (#17) from add-parameter into master (Reviewed-on: #17) 2023-06-05 20:58:51 +00:00
e48b262d7e add parameter no-image 2023-06-03 09:07:33 +02:00
2f1c081823 add parameter 2023-06-01 15:28:48 +02:00
4bd6f5c038 Merge pull request 'add wait' (#16) from wait_remove into master (Reviewed-on: #16) 2023-05-29 21:36:38 +00:00
d3a03e1cb3 add wait 2023-05-29 23:36:11 +02:00
4 changed files with 355 additions and 131 deletions


@@ -3,12 +3,19 @@ from requests.auth import HTTPBasicAuth
 from getpass import getpass
 from urllib.parse import urlparse
 from concurrent import futures
-from concurrent.futures import as_completed, wait
+from concurrent.futures import as_completed, wait, ALL_COMPLETED
-import argparse, logging, threading
+import argparse, logging, threading, os, glob
 from lib.WPImport import WPimport
 from lib.WPExport import WPExport
 from lib.WPRemove import WPRemove
+from lib.WPChange import WPChange
+
+def change(index, number, args, logger):
+    changeWp = WPChange(logger=logger, index_name=index, number_thread=number)
+    changeWp.fromDirectory(args.directory)
+    del changeWp

 def remove(index, number, args, basic, logger, ssl_wordpress):
     removeWp = WPRemove(basic=basic, wordpress="", logger=logger, ssl_wordpress=ssl_wordpress, index_name=index, number_thread=number)
@@ -33,21 +40,21 @@ def remove(index, number, args, basic, logger, ssl_wordpress):
     del removeWp

-def download(name_thread, max_thread, url, logger, parser, directory, html, img, ssl_canalblog):
+def download(name_thread, max_thread, url, logger, parser, directory, html, img, ssl_canalblog, revert, tmp):
     exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory, ssl_canalblog=ssl_canalblog)
-    webpage = exportWp.getUrlPage(name_thread, max_thread)
+    if not revert:
+        exportWp.getUrlPage(name_thread, max_thread)
     for i in ["article", "page"]:
         for j in ["publications", "principal"]:
             if html is False:
-                exportWp.downloadHTML(webpage[j][i])
+                exportWp.downloadHTML(j, i)
             if img is False:
-                exportWp.downloadImg(webpage[j][i])
+                exportWp.downloadImg(j, i)
     del exportWp

-def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress, ssl_canalblog):
+def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress, ssl_canalblog, create, update, image, revert, tmp):
     canalblog = canalblog.split(",")
     wordpress = wordpress.split(",")
     name = "Thread-{0}".format(int(name_thread) + 1)
@@ -63,14 +70,15 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas
             except Exception as err:
                 logger.error("{0} : parsing error : {1}".format(name, err))
                 exit(1)
-            exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, ssl_canalblog=ssl_canalblog)
+            exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, ssl_canalblog=ssl_canalblog, tmp=tmp)
-            webpage = exportWp.getUrlPage(name_thread, max_thread)
+            if not revert:
+                exportWp.getUrlPage(name_thread, max_thread)
             del exportWp
             for j in wordpress:
-                importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress)
+                importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp)
                 for k in ["article", "page"]:
                     for l in ["publications", "principal"]:
-                        importWp.fromUrl(webpage[l][k])
+                        importWp.fromUrl(l, k)
                 del importWp
     else:
@@ -86,9 +94,10 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas
                 logger.error("parsing error : {0}".format(err))
                 exit(1)
             exportWp = WPExport(name=name, url=url, logger=logger, parser=parser, ssl_canalblog=ssl_canalblog)
-            webpage = exportWp.getUrlPage(name_thread, max_thread)
+            if not revert:
+                exportWp.getUrlPage(name_thread, max_thread)
             del exportWp
-            importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress)
+            importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp)
             for k in ["article", "page"]:
                 for l in ["publications", "principal"]:
@@ -97,23 +106,23 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas
             del importWp

-def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial, ssl_wordpress):
+def importDirectory(name_thread, max_thread, directory, logger, parser, wordpress, basic, serial, ssl_wordpress, create, update, image):
     name = "Thread-{0}".format(int(name_thread) + 1)
     directory = directory.split(",")
     wordpress = wordpress.split(",")
     if serial is False:
         for i in wordpress:
-            importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress)
+            importWp = WPimport(name=name, basic=basic, wordpress=i, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image)
             for j in directory:
                 importWp.fromDirectory(j, name_thread, max_thread)
             del importWp
     else:
         if len(directory) != len(wordpress):
-            logger.error("{0} : Error : Number directory is differant than wordpress".format(name))
+            logger.error("{0} : Error : Number directory is different than wordpress".format(name))
             exit(1)
         for i in range(0, len(wordpress)-1):
-            importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress)
+            importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image)
             importWp.fromDirectory(directory[i])
             del importWp
@@ -127,6 +136,8 @@ if __name__ == '__main__':
     parser.add_argument("--parser", help="Parser content", default="html.parser")
     parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1)
     parser.add_argument("--no-ssl", help="No ssl for canalblog and/or wordpress (example wordpress,canalblog)", dest="ssl", default="")
+    parser.add_argument("--revert", help="Restart a work from stopping work", action="store_true")
+    parser.add_argument("--tmp", help="directory tmp", default="/tmp/import_export_canablog")

     subparsers = parser.add_subparsers(dest="command")
@@ -143,6 +154,10 @@ if __name__ == '__main__':
     import_parser.add_argument("--remove-categories", help="Remove all categories", dest="categories", action="store_true")
     import_parser.add_argument("--remove-tags", help="Remove all tags", dest="tags", action="store_true")
     import_parser.add_argument("--remove-media", help="Remove all media", dest="media", action="store_true")
+    import_parser.add_argument("--no-create", help="No create post", dest="create", default="store_false", action="store_true")
+    import_parser.add_argument("--no-update", help="No update post", dest="update", default="store_false", action="store_true")
+    import_parser.add_argument("--no-image", help="No image add or update", dest="image", default="store_false", action="store_true")

     remove_parser = subparsers.add_parser("remove")
@@ -168,6 +183,13 @@ if __name__ == '__main__':
     export_parser.add_argument("--no-img", help="No img", dest="img", action="store_true")
     export_parser.add_argument("--no-html", help="No HTML", dest="html", action="store_true")

+    change_parser = subparsers.add_parser("change")
+    change_parser.add_argument("--directory",
+                        default="",
+                        help="Directory")
+    change_parser.add_argument("--file",
+                        default="",
+                        help="File")

     args = parser.parse_args()
@@ -204,6 +226,8 @@ if __name__ == '__main__':
         fileHandler.setFormatter(formatter)
         logger.addHandler(fileHandler)

+    os.makedirs(args.tmp, exist_ok=True)
+
     if args.command == "import" or args.command == "remove":
         password = args.password
         if len(args.password) == 0:
@@ -221,23 +245,36 @@ if __name__ == '__main__':
                 importWp.setUrl(i)
                 importWp.fromFile(files=args.file.split(","))
         if len(args.directory) > 0:
-            remove(args, basic, logger, ssl_wordpress)
            try:
                with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
+                    wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ]
+                    wait(wait_for, return_when=ALL_COMPLETED)
                    wait_for = [
-                        ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress)
+                        ex.submit(importDirectory, i, int(args.parallel), args.directory, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, args.create, args.update, args.image)
                        for i in range(0, int(args.parallel))
                    ]
            except Exception as err:
                logger.error("Threading error : {0}".format(err))
        if len(args.canalblog) > 0:
-            remove(args, basic, logger, ssl_wordpress)
            try:
                with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
+                    wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ]
+                    wait(wait_for, return_when=ALL_COMPLETED)
+                    if args.revert is True:
+                        files_tmp = glob.glob("{0}/*.json".format(args.tmp))
+                        if len(files_tmp) == 0:
+                            logger.error("Error revert, because files not found")
+                            exit(1)
+                        if len(files_tmp) != int(args.parallel):
+                            for file_r in files_tmp:
+                                os.remove(file_r)
                    wait_for = [
-                        ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, ssl_canalblog)
+                        ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, ssl_canalblog, args.create, args.update, args.image, args.revert, args.tmp)
                        for i in range(0, int(args.parallel))
                    ]
            except Exception as err:
                logger.error("Threading error : {0}".format(err))
        exit(0)
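The hunk above replaces the direct remove(...) call with one remove job per worker and an explicit barrier, wait(..., return_when=ALL_COMPLETED), before the import jobs are submitted. A minimal sketch of that submit-then-wait pattern, with hypothetical placeholder workers standing in for remove() and importUrl():

from concurrent import futures
from concurrent.futures import wait, ALL_COMPLETED

def remove_job(index, total):
    print("remove slice {0}/{1}".format(index, total))    # placeholder for remove(...)

def import_job(index, total):
    print("import slice {0}/{1}".format(index, total))    # placeholder for importUrl(...)

parallel = 4
with futures.ThreadPoolExecutor(max_workers=parallel) as ex:
    wait_for = [ex.submit(remove_job, i, parallel) for i in range(parallel)]
    wait(wait_for, return_when=ALL_COMPLETED)              # all removals finish before imports start
    wait_for = [ex.submit(import_job, i, parallel) for i in range(parallel)]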
@@ -271,7 +308,7 @@ if __name__ == '__main__':
            try:
                with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
                    wait_for = [
-                        ex.submit(download, i, int(args.parallel), url, logger, args.parser, args.directory, args.html, args.img, ssl_canalblog)
+                        ex.submit(download, i, int(args.parallel), url, logger, args.parser, args.directory, args.html, args.img, ssl_canalblog, args.revert, args.tmp)
                        for i in range(0, int(args.parallel))
                    ]
            except Exception as err:
@@ -286,3 +323,16 @@ if __name__ == '__main__':
            except Exception as err:
                logger.error("Thread error for remove : {0}".format(err))
        exit(0)
+
+    if args.command == "change":
+        if len(args.directory) > 0:
+            try:
+                with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
+                    wait_for = [ ex.submit(change, i, args.parallel, args, logger) for i in range(0, int(args.parallel)) ]
+            except Exception as err:
+                logger.error("Thread error for remove : {0}".format(err))
+        if len(args.file) > 0:
+            changeWp = WPChange(logger=logger)
+            for filei in args.file.split(","):
+                changeWp.fromFile(filei)
+        exit(0)

lib/WPChange.py (new file, 128 lines)

@@ -0,0 +1,128 @@
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests, os, logging, re, json

class WPChange:

    # Constructor
    def __init__(self, index_name=1, number_thread=1, logger=None, parser="html.parser"):
        self._name = "Thread-{0}".format(index_name)
        self._logger = logger
        self._number_thread = number_thread
        self._parser = parser

    # Destructor
    def __del__(self):
        print("{0} : Import finished".format(self._name))

    # Public method

    ## from file
    def fromFile(self, files=[], number_thread=1, max_thread=1):
        divFiles = int(len(files) / max_thread)
        currentRangeFiles = int(divFiles * (number_thread))
        firstRange = int(currentRangeFiles - divFiles)
        self._logger.debug("{0} : index : {1}".format(self._name,number_thread))
        self._logger.debug("{0} : first range : {1}".format(self._name,firstRange))
        self._logger.debug("{0} : last range : {1}".format(self._name,currentRangeFiles))
        for i in range(firstRange, currentRangeFiles):
            if os.path.exists(files[i]):
                self._logger.info("{0} : ({1}/{2}) File is being processed : {3}".format(self._name, i+1, currentRangeFiles + 1, files[i]))
                self._change(files[i])

    ## From directory
    def fromDirectory(self, directory="", number_thread=1, max_thread=1):
        directory = "{0}/archives".format(directory)
        directories = self._getDirectories([], "{0}".format(directory))
        if len(directories) > 0:
            files = self._getFiles(directories)
            self.fromFile(files, number_thread, max_thread)
        else:
            self._logger.error("{0} : No files for {1}".format(self._name, directory))

    # Private method

    ## Get all files
    def _getFiles(self, item):
        files = []
        for i in item:
            for j in os.listdir(i):
                if os.path.isfile("{0}/{1}".format(i, j)):
                    files.append("{0}/{1}".format(i, j))
        return files

    ## Get directories
    def _getDirectories(self, subdirectory, item):
        sub = subdirectory
        for i in os.listdir(item):
            if os.path.isdir("{0}/{1}".format(item, i)):
                sub.append("{0}/{1}".format(item, i))
                subdirectory = self._getDirectories(sub, "{0}/{1}".format(item, i))
        return subdirectory

    ## Change path img file
    def _change(self, file):
        ext_img = ["png", "svg", "gif", "jpg", "jpeg"]
        try:
            with open(file, 'r') as f:
                content = f.read()
                soup = BeautifulSoup(content, self._parser)
                img = soup.find_all("img")
                for i in img:
                    src = i.get("src")
                    o = urlparse(src)
                    if len(o.netloc) > 0:
                        self._logger.info("{0} : Change source image {1} /img/{2}/{3}".format(self._name, src, o.netloc, o.path))
                        content = content.replace(src, "/img/{0}/{1}".format(o.netloc, o.path))
                script = soup.find_all("script", {"type": "text/javascript"})
                for i in script:
                    src = i.get("src")
                    if src is not None:
                        o = urlparse(src)
                        if len(o.netloc) > 0:
                            self._logger.info("{0} : Change source js {1} /dists/js/{2}/{3}".format(self._name, src, o.netloc, o.path))
                            content = content.replace(src, "/dists/js/{0}/{1}".format(o.netloc, o.path))
                link = soup.find_all("link", {"rel": "stylesheet"})
                for i in link:
                    href = i.get("href")
                    if href is not None:
                        o = urlparse(href)
                        if len(o.netloc) > 0:
                            self._logger.info("{0} : Change source css {1} /dists/css/{2}/{3}".format(self._name, href, o.netloc, o.path))
                            content = content.replace(href, "/dists/css/{0}/{1}".format(o.netloc, o.path))
                a = soup.find_all("a", {"target": "_blank"})
                for i in a:
                    href = i.get("href")
                    if href is not None:
                        o = urlparse(href)
                        if len(o.netloc) > 0:
                            ext = o.path.split(".")[len(o.path.split("."))-1]
                            if ext in ext_img:
                                self._logger.info("{0} : Change a img {1} /img/{2}/{3}".format(self._name, href, o.netloc, o.path))
                                content = content.replace(href, "/img/{0}/{1}".format(o.netloc, o.path))
            try:
                with open(file, "w") as f:
                    self._logger.info("{0} : File write : {1}".format(self._name, file))
                    f.write(content)
            except Exception as ex:
                self._logger.error("{0} : Error for write file {1} : {2}".format(self._name, file, ex))
        except Exception as ex:
            self._logger.error("{0} : Error for read file {1} : {2}".format(self._name, file, ex))
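A minimal usage sketch for the new WPChange class, not part of the patch; the file path below is a hypothetical HTML page produced by the export step:

import logging
from lib.WPChange import WPChange

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("WPChange-test")

changeWp = WPChange(logger=logger, index_name=1, number_thread=1)
# With max_thread=1 the whole list is processed; files that do not exist are skipped silently.
changeWp.fromFile(files=["backup/example.canalblog.com/archives/index.html"], number_thread=1, max_thread=1)
del changeWp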

lib/WPExport.py

@@ -1,12 +1,12 @@
 #!/usr/bin/python3
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
-import requests, os, argparse, logging
+import requests, os, argparse, logging, json
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry

 class WPExport:
-    def __init__(self, name = "Thread-0", url = "", logger = None, parser = "html.parser", directory = "backup", ssl_canalblog=True):
+    def __init__(self, name = "Thread-0", url = "", logger = None, parser = "html.parser", directory = "backup", ssl_canalblog=True, tmp="/tmp/import_export_canablog"):
         self._url = url
         self._logger = logger
         self._parser = parser
@@ -22,7 +22,7 @@ class WPExport:
                            status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)
         self._request.mount('{0}://'.format(self._protocol), HTTPAdapter(max_retries=retries))
+        self._tmp = tmp

     # Destructor
     def __del__(self):
@@ -57,13 +57,13 @@ class WPExport:
     # Download HTML
-    def downloadHTML(self, webpage):
-        self._downloadPage(webpage, self._dir)
+    def downloadHTML(self, first, second):
+        self._downloadPage(webpage[first][second], self._dir)

     # Download Image
-    def downloadImg(self, webpage):
-        page_src = self._getImg(webpage)
+    def downloadImg(self, first, second):
+        page_src = self._getImg(webpage[first][second])
         o = urlparse(self._url)
         self._downloadPage(page_src, "{0}/{1}/{2}".format(self._dir, o.path, "img"))
@@ -161,7 +161,14 @@ class WPExport:
         except Exception as err:
             self._logger.error("{0} : Exception error : {1}".format(self._name, err))
             exit(1)
-        return webpage
+        try:
+            string_webpage = json.dumps(webpage)
+            open("{0}/{1}.json".format(self._tmp, self._name), "wt").write(string_webpage)
+        except Exception as ex:
+            self._logger.error("{0} : Error for writing webpage : {1}".format(self._name, ex))
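The hunk above stops returning the crawled URL dictionary and instead serializes it to <tmp>/<thread name>.json, which the --revert path and WPimport.fromUrl later read back. A small round-trip sketch of that checkpoint file; the nested structure is inferred from how the callers index webpage[first][second], and the URL is a made-up example:

import json, os

tmp = "/tmp/import_export_canablog"
name = "Thread-1"
webpage = {
    "publications": {"article": ["https://example.canalblog.com/archives/2023/06/01/post.html"], "page": []},
    "principal": {"article": [], "page": []},
}

os.makedirs(tmp, exist_ok=True)
with open("{0}/{1}.json".format(tmp, name), "wt") as f:
    f.write(json.dumps(webpage))                 # what getUrlPage now does instead of returning

with open("{0}/{1}.json".format(tmp, name)) as f:
    restored = json.loads(f.read())              # what fromUrl / downloadHTML read back
print(restored["publications"]["article"])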

lib/WPImport.py

@@ -8,7 +8,7 @@ from requests.packages.urllib3.util.retry import Retry
 class WPimport:
     # Constructor
-    def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True):
+    def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog"):
         self._name = name
         self._basic = basic
         self._wordpress = wordpress
@@ -16,6 +16,7 @@ class WPimport:
         self._parser = parser
         self._headers_json = {'Content-Type': 'application/json; charset=utf-8', 'Accept':'application/json'}
         self._protocol = "https"
+        self._directory = "backup"
         if ssl_wordpress is False:
             self._protocol = "http"
         self._request = requests.Session()
@@ -24,7 +25,10 @@ class WPimport:
                            status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)
         self._request.mount('{0}://'.format(self._protocol), HTTPAdapter(max_retries=retries))
+        self._no_create = no_create
+        self._no_update = no_update
+        self._no_image = no_image
+        self._tmp = tmp

     # Destructor
     def __del__(self):
@@ -35,38 +39,56 @@ class WPimport:
     def setUrl(self, wordpress):
         self._wordpress = wordpress

-    def fromUrl(self, webpage):
-        for i in range(0, len(webpage)):
-            try:
-                r = self._request.get(webpage[i])
-                if r.status_code == 200:
-                    self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i]))
-                    soup = BeautifulSoup(r.content, self._parser)
-                    articlebody = soup.find_all("div", class_="articlebody")
-                    if len(articlebody) > 0:
-                        self._addOrUpdatePost(soup)
-                    else:
-                        self._addOrUpdateFeaturedMedia(soup)
-                else:
-                    self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code))
-                    self._logger.debug("{0} : {1}".format(self._name, r.content))
-            except ConnectionError as err:
-                self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err))
-                exit(1)
-            except Exception as err:
-                self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, webpage[i], err))
+    def fromUrl(self, first, second):
+        try:
+            with open("{0}/{1}.json".format(self._tmp, self._name)) as file:
+                webpage_content = json.loads(file.read())
+                self._logger.debug("{0} : size of webpage : {1}".format(self._name, len(webpage_content)))
+                webpage = webpage_content[first][second]
+                for i in range(0, len(webpage)):
+                    try:
+                        r = self._request.get(webpage[i])
+                        if r.status_code == 200:
+                            self._logger.info("{0} : ({1}/{2}) : Page is importing : {3}".format(self._name, i+1, len(webpage), webpage[i]))
+                            soup = BeautifulSoup(r.content, self._parser)
+                            articlebody = soup.find_all("div", class_="articlebody")
+                            if len(articlebody) > 0:
+                                self._addOrUpdatePost(soup)
+                            else:
+                                self._addOrUpdateFeaturedMedia(soup)
+                            del webpage_content[first][second][i]
+                            webpage_content = json.dumps(webpage_content)
+                            open("{0}/{1}.json".format(self._tmp, self._name), "wt").write(webpage_content)
+                        else:
+                            self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code))
+                            self._logger.debug("{0} : {1}".format(self._name, r.content))
+                    except ConnectionError as err:
+                        self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err))
+                        exit(1)
+                    except IOError as err:
+                        self._logger.error("{0} : Connection error for IO url {1} : {2}".format(self._name, webpage[i], err))
+                        exit(1)
+                    except Exception as err:
+                        self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, webpage[i], err))
+        except Exception as ex:
+            self._logger.error("{0} : Read file json from tmp : {1}".format(self._name, ex))

     def fromDirectory(self, directory="", number_thread=1, max_thread=1):
+        self._directory = directory
         directory = "{0}/archives".format(directory)
         directories = self._getDirectories([], "{0}".format(directory))
         if len(directories) > 0:
             files = self._getFiles(directories)
-            self.fromFile(files, number_thread, max_thread)
+            self.fromFile(files=files, number_thread=number_thread, max_thread=max_thread)
         else:
             self._logger.error("{0} : No files for {1}".format(self._name, directory))

     def fromFile(self, files=[], number_thread=1, max_thread=1):
         divFiles = int(len(files) / max_thread)
         currentRangeFiles = int(divFiles * (number_thread+1))
@@ -435,48 +457,64 @@ class WPimport:
         img_a = articlebody[0].find_all("a", {"target": "_blank"})
         self._logger.debug("{0} : Number of image's link : {1}".format(self._name, len(img_a)))
         list_img = []
-        for i in img_a:
-            new_img = {}
-            img = i.find_all("img")
-            self._logger.debug("{0} : Number of image's tag : {1}".format(self._name, len(img)))
-            if len(img) > 0:
-                href_a = i.get("href")
-                href_img = img[0].get("src")
-                new_img["old_src"]=href_img
-                new_img["old_href"]=href_a
-                try:
-                    page_img = self._request.get(href_img)
-                    if page_img.status_code == 404:
-                        href_img = href_a
-                        try:
-                            page_img = self._request.get(href_a)
-                        except ConnectionError as err:
-                            self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err))
-                            exit(1)
-                        except Exception as err:
-                            self._logger.error("{0} : Exception error for get image : {1}".format(self._name, err))
-                            exit(1)
-                    self._logger.debug("{0} : Status code for image {1} : {2}".format(self._name, href_img, page_img.status_code))
-                    if page_img.status_code == 200:
-                        media=self._addOrUpdateMedia(href_img, page_img)
-                        new_img["id"]=media["id"]
-                        new_img["new_src"]=media["rendered"]
-                        list_img.append(new_img)
-                        if href_img != href_a:
-                            media=self._addOrUpdateMedia(href_a, page_img)
-                            new_img["id"]=media["id"]
-                            new_img["new_src"]=media["rendered"]
-                            list_img.append(new_img)
-                    if page_img.status_code not in [200, 404]:
-                        self._logger.error("{0} : Connection error with status code for get image : {1}".format(self._name, page_img.status_code))
-                        self._logger.debug("{0} : {1}".format(self._name, page_img.content))
-                except ConnectionError as err:
-                    self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err))
-                    exit(1)
-                except Exception as err:
-                    self._logger.error("{0} : Exception error for get image : {1}".format(self._name, err))
-                    exit(1)
+        if self._no_image is False:
+            for i in img_a:
+                new_img = {}
+                img = i.find_all("img")
+                self._logger.debug("{0} : Number of image's tag : {1}".format(self._name, len(img)))
+                if len(img) > 0:
+                    href_a = i.get("href")
+                    href_img = img[0].get("src")
+                    href_a_o = urlparse(href_a)
+                    href_img_o = urlparse(href_img)
+                    new_img["old_src"]=href_img
+                    new_img["old_href"]=href_a
+                    try:
+                        if len(href_img_o.netloc) > 0:
+                            img_ok = False
+                            page_img = self._request.get(href_img)
+                            if page_img.status_code == 404:
+                                href_img = href_a
+                                try:
+                                    page_img = self._request.get(href_a)
+                                    if page_img.status_code == 200:
+                                        img_ok = True
+                                except ConnectionError as err:
+                                    self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err))
+                                    exit(1)
+                                except Exception as err:
+                                    self._logger.error("{0} : Exception error for get image : {1}".format(self._name, err))
+                                    exit(1)
+                        else:
+                            if os.path.exists("{0}/..{1}".format(self._directory, href_img)):
+                                page_img = open("{0}/..{1}".format(self._directory, href_img), "r")
+                                img_ok = True
+                            else:
+                                if os.path.exists("{0}/..{1}".format(self._directory, href_a)):
+                                    page_img = open("{0}/..{1}".format(self._directory, href_a), "r")
+                                    img_ok = True
+                        self._logger.debug("{0} : Status code for image {1} : {2}".format(self._name, href_img, page_img.status_code))
+                        if img_ok is True:
+                            media=self._addOrUpdateMedia(href_img, page_img)
+                            new_img["id"]=media["id"]
+                            new_img["new_src"]=media["rendered"]
+                            list_img.append(new_img)
+                            if href_img != href_a:
+                                media=self._addOrUpdateMedia(href_a, page_img)
+                                new_img["id"]=media["id"]
+                                new_img["new_src"]=media["rendered"]
+                                list_img.append(new_img)
+                        if page_img.status_code not in [200, 404]:
+                            self._logger.error("{0} : Connection error with status code for get image : {1}".format(self._name, page_img.status_code))
+                            self._logger.debug("{0} : {1}".format(self._name, page_img.content))
+                    except ConnectionError as err:
+                        self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err))
+                        exit(1)
+                    except Exception as err:
+                        self._logger.error("{0} : Exception error for get image : {1}".format(self._name, err))
+                        exit(1)

         self._logger.debug("{0} : Number of image : {1}".format(self._name, len(list_img)))
         comment_post = self._getComment(comment)
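In the new branch above, an image reference whose URL has no host part is no longer fetched over HTTP; the importer looks for the already-downloaded file relative to the backup directory instead. A tiny sketch of that path construction; the directory and image path are hypothetical:

from urllib.parse import urlparse

directory = "backup/example.canalblog.com"        # stands in for self._directory
href_img = "/img/storage.canalblog.com/12/34/pic.jpg"

o = urlparse(href_img)
if len(o.netloc) == 0:                            # no host: treat as a local export artifact
    local_path = "{0}/..{1}".format(directory, href_img)
    print(local_path)                             # backup/example.canalblog.com/../img/...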
@@ -615,45 +653,46 @@ class WPimport:
                     self._logger.debug("{0} : Search title posts for |{2}| : |{1}|".format(self._name, title_rendered, title))
                     self._logger.debug("{0} : SIze of title : {1} - {2}".format(self._name, len(title), len(title_rendered)))
                     if title_rendered == title:
-                        page_is_exist = True
-                        post_id = i["id"]
-                        count = count + 1
-                        if count > 1:
-                            self._logger.info("{0} : Page {1} is double and going to delete".format(self._name, title))
-                            try:
-                                params = {"force":1}
-                                page = self._request.delete("{2}://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id, self._protocol), auth=self._basic, headers=self._headers_json, params=params)
-                                if page.status_code == 200:
-                                    self._logger.info("{0} : Post deleted : {1}".format(self._name, title))
-                                else:
-                                    self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code))
-                                    self._logger.debug("{0} : {1}".format(self._name, page.content))
-                            except ConnectionError as err:
-                                self._logger.error("{0} : Connection error for deleted post : {1}".format(self._name, err))
-                                exit(1)
-                            except Exception as err:
-                                self._logger.error("{0} : Exception error for deleted post : {1}".format(self._name, err))
-                        else:
-                            self._logger.debug("{0} : Data for post to update : {1}".format(self._name, i))
-                            self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, title))
-                            try:
-                                page = self._request.post("{2}://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id, self._protocol), auth=self._basic, headers=self._headers_json, data=json.dumps(data))
-                                if page.status_code == 200:
-                                    result = page.json()
-                                    self._logger.info("{0} : Post updated : {1}".format(self._name, title))
-                                    self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"])
-                                    self._linkImgPost(result["title"]["raw"], list_img, result["id"])
-                                else:
-                                    self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code))
-                                    self._logger.debug("{0} : {1}".format(self._name, page.content))
-                            except ConnectionError as err:
-                                self._logger.error("{0} : Connection error for update post : {1}".format(self._name, err))
-                                exit(1)
-                            except Exception as err:
-                                self._logger.error("{0} : Exception error for update post : {1}".format(self._name, err))
+                        if self._no_update is False:
+                            page_is_exist = True
+                            post_id = i["id"]
+                            count = count + 1
+                            if count > 1:
+                                self._logger.info("{0} : Page {1} is double and going to delete".format(self._name, title))
+                                try:
+                                    params = {"force":1}
+                                    page = self._request.delete("{2}://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id, self._protocol), auth=self._basic, headers=self._headers_json, params=params)
+                                    if page.status_code == 200:
+                                        self._logger.info("{0} : Post deleted : {1}".format(self._name, title))
+                                    else:
+                                        self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code))
+                                        self._logger.debug("{0} : {1}".format(self._name, page.content))
+                                except ConnectionError as err:
+                                    self._logger.error("{0} : Connection error for deleted post : {1}".format(self._name, err))
+                                    exit(1)
+                                except Exception as err:
+                                    self._logger.error("{0} : Exception error for deleted post : {1}".format(self._name, err))
+                            else:
+                                self._logger.debug("{0} : Data for post to update : {1}".format(self._name, i))
+                                self._logger.info("{0} : Page {1} already exist and going to update".format(self._name, title))
+                                try:
+                                    page = self._request.post("{2}://{0}/wp-json/wp/v2/posts/{1}".format(self._wordpress, post_id, self._protocol), auth=self._basic, headers=self._headers_json, data=json.dumps(data))
+                                    if page.status_code == 200:
+                                        result = page.json()
+                                        self._logger.info("{0} : Post updated : {1}".format(self._name, title))
+                                        self._addOrUpdateComment(result["id"], comment_post, result["title"]["raw"])
+                                        self._linkImgPost(result["title"]["raw"], list_img, result["id"])
+                                    else:
+                                        self._logger.error("{0} : Post not updated due status code : {1}".format(self._name, page.status_code))
+                                        self._logger.debug("{0} : {1}".format(self._name, page.content))
+                                except ConnectionError as err:
+                                    self._logger.error("{0} : Connection error for update post : {1}".format(self._name, err))
+                                    exit(1)
+                                except Exception as err:
+                                    self._logger.error("{0} : Exception error for update post : {1}".format(self._name, err))
                     if page.status_code == 400:
                         self._logger.error("{0} : Connection for update post unauthorized : {1}".format(self._name, page.status_code))
                         self._logger.debug("{0} : {1}".format(self._name, page.content))
@@ -667,7 +706,7 @@ class WPimport:
            except Exception as err:
                self._logger.error("{0} : Exception error for search post : {1}".format(self._name, err))

-        if page_is_exist is False:
+        if page_is_exist is False and self._no_create is False:
            try:
                self._logger.info("{0} : Creating posts : {1}".format(self._name, data["title"]))
                page = self._request.post("{1}://{0}/wp-json/wp/v2/posts".format(self._wordpress, self._protocol), auth=self._basic, headers=self._headers_json, data=json.dumps(data))