12 Commits

Author SHA1 Message Date
b88917127d Merge pull request 'webpage-file' (#20) from webpage-file into master
Reviewed-on: #20
2023-06-26 22:28:26 +00:00
781d8959c4 fix tmp directory parameter 2023-06-27 00:25:23 +02:00
a67ff868f3 fix json read file 2023-06-26 23:52:03 +02:00
8e0abc40bd check files tmp 2023-06-26 23:09:54 +02:00
9149a6c5cb rollback webpage 2023-06-26 22:44:42 +02:00
d1b6e8048a add revert files json 2023-06-25 21:16:05 +02:00
0eab1d885b add open file tmp 2023-06-20 21:38:39 +02:00
35ff22d463 change parameter for webpage 2023-06-20 00:17:38 +02:00
7dace5bdb7 add file tmp 2023-06-19 23:58:59 +02:00
703cc8922a Merge pull request 'diff-img' (#19) from diff-img into master
Reviewed-on: #19
2023-06-16 22:08:50 +00:00
ff3ee301fb diff img path done 2023-06-15 00:10:44 +02:00
04da5bc5f6 diff path network 2023-06-13 22:00:51 +02:00
3 changed files with 117 additions and 61 deletions

View File

@@ -5,7 +5,7 @@ from urllib.parse import urlparse
from concurrent import futures from concurrent import futures
from concurrent.futures import as_completed, wait, ALL_COMPLETED from concurrent.futures import as_completed, wait, ALL_COMPLETED
import argparse, logging, threading import argparse, logging, threading, os, glob
from lib.WPImport import WPimport from lib.WPImport import WPimport
from lib.WPExport import WPExport from lib.WPExport import WPExport
from lib.WPRemove import WPRemove from lib.WPRemove import WPRemove
@@ -40,21 +40,21 @@ def remove(index, number, args, basic, logger, ssl_wordpress):
del removeWp del removeWp
def download(name_thread, max_thread, url, logger, parser, directory, html, img, ssl_canalblog): def download(name_thread, max_thread, url, logger, parser, directory, html, img, ssl_canalblog, revert, tmp):
exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory, ssl_canalblog=ssl_canalblog) exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory, ssl_canalblog=ssl_canalblog)
webpage = exportWp.getUrlPage(name_thread, max_thread) if not revert:
exportWp.getUrlPage(name_thread, max_thread)
for i in ["article", "page"]: for i in ["article", "page"]:
for j in ["publications", "principal"]: for j in ["publications", "principal"]:
if html is False: if html is False:
exportWp.downloadHTML(webpage[j][i]) exportWp.downloadHTML(j, i)
if img is False: if img is False:
exportWp.downloadImg(webpage[j][i]) exportWp.downloadImg(j, i)
del exportWp
def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress, ssl_canalblog, create, update, image, revert, tmp):
def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial, ssl_wordpress, ssl_canalblog, create, update, image):
canalblog = canalblog.split(",") canalblog = canalblog.split(",")
wordpress = wordpress.split(",") wordpress = wordpress.split(",")
name = "Thread-{0}".format(int(name_thread) + 1) name = "Thread-{0}".format(int(name_thread) + 1)
@@ -70,14 +70,15 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas
except Exception as err: except Exception as err:
logger.error("{0} : parsing error : {1}".format(name, err)) logger.error("{0} : parsing error : {1}".format(name, err))
exit(1) exit(1)
exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, ssl_canalblog=ssl_canalblog) exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, ssl_canalblog=ssl_canalblog, tmp=tmp)
webpage = exportWp.getUrlPage(name_thread, max_thread) if not revert:
exportWp.getUrlPage(name_thread, max_thread)
del exportWp del exportWp
for j in wordpress: for j in wordpress:
importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image) importWp = WPimport(name=name, basic=basic, wordpress=j, logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp)
for k in ["article", "page"]: for k in ["article", "page"]:
for l in ["publications", "principal"]: for l in ["publications", "principal"]:
importWp.fromUrl(webpage[l][k]) importWp.fromUrl(l, k)
del importWp del importWp
else: else:
@@ -93,9 +94,10 @@ def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, bas
logger.error("parsing error : {0}".format(err)) logger.error("parsing error : {0}".format(err))
exit(1) exit(1)
exportWp = WPExport(name=name, url=url, logger=logger, parser=parser, ssl_canalblog=ssl_canalblog) exportWp = WPExport(name=name, url=url, logger=logger, parser=parser, ssl_canalblog=ssl_canalblog)
webpage = exportWp.getUrlPage(name_thread, max_thread) if not revert:
exportWp.getUrlPage(name_thread, max_thread)
del exportWp del exportWp
importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image) importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image, tmp=tmp)
for k in ["article", "page"]: for k in ["article", "page"]:
for l in ["publications", "principal"]: for l in ["publications", "principal"]:
@@ -117,7 +119,7 @@ def importDirectory(name_thread, max_thread, directory, logger, parser, wordpres
else: else:
if len(directory) != len(wordpress): if len(directory) != len(wordpress):
logger.error("{0} : Error : Number directory is differant than wordpress".format(name)) logger.error("{0} : Error : Number directory is different than wordpress".format(name))
exit(1) exit(1)
for i in range(0, len(wordpress)-1): for i in range(0, len(wordpress)-1):
importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image) importWp = WPimport(name=name, basic=basic, wordpress=wordpress[i], logger=logger, parser=parser, ssl_wordpress=ssl_wordpress, no_create=create, no_update=update, no_image=image)
@@ -134,6 +136,8 @@ if __name__ == '__main__':
parser.add_argument("--parser", help="Parser content", default="html.parser") parser.add_argument("--parser", help="Parser content", default="html.parser")
parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1) parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1)
parser.add_argument("--no-ssl", help="No ssl for canalblog and/or wordpress (example wordpress,canalblog)", dest="ssl", default="") parser.add_argument("--no-ssl", help="No ssl for canalblog and/or wordpress (example wordpress,canalblog)", dest="ssl", default="")
parser.add_argument("--revert", help="Restart a work from stopping work", action="store_true")
parser.add_argument("--tmp", help="directory tmp", default="/tmp/import_export_canablog")
subparsers = parser.add_subparsers(dest="command") subparsers = parser.add_subparsers(dest="command")
@@ -188,7 +192,6 @@ if __name__ == '__main__':
help="File") help="File")
args = parser.parse_args() args = parser.parse_args()
logger = logging.getLogger('import export canalblog') logger = logging.getLogger('import export canalblog')
@@ -223,6 +226,8 @@ if __name__ == '__main__':
fileHandler.setFormatter(formatter) fileHandler.setFormatter(formatter)
logger.addHandler(fileHandler) logger.addHandler(fileHandler)
os.makedirs(args.tmp, exist_ok=True)
if args.command == "import" or args.command == "remove": if args.command == "import" or args.command == "remove":
password = args.password password = args.password
if len(args.password) == 0: if len(args.password) == 0:
@@ -255,8 +260,17 @@ if __name__ == '__main__':
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ] wait_for = [ ex.submit(remove, i, args.parallel, args, basic, logger, ssl_wordpress) for i in range(0, int(args.parallel)) ]
wait(wait_for, return_when=ALL_COMPLETED) wait(wait_for, return_when=ALL_COMPLETED)
if args.revert is True:
files_tmp = glob.glob("{0}/*.json".format(args.tmp))
if len(files_tmp) == 0:
logger.error("Error revert, because files not found")
exit(1)
if len(files_tmp) != int(args.parallel):
for file_r in files_tmp:
os.remove(file_r)
wait_for = [ wait_for = [
ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, ssl_canalblog, args.create, args.update, args.image) ex.submit(importUrl, i, int(args.parallel), args.canalblog, logger, args.parser, args.wordpress, basic, args.serial, ssl_wordpress, ssl_canalblog, args.create, args.update, args.image, args.revert, args.tmp)
for i in range(0, int(args.parallel)) for i in range(0, int(args.parallel))
] ]
@@ -294,7 +308,7 @@ if __name__ == '__main__':
try: try:
with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex: with futures.ThreadPoolExecutor(max_workers=int(args.parallel)) as ex:
wait_for = [ wait_for = [
ex.submit(download, i, int(args.parallel), url, logger, args.parser, args.directory, args.html, args.img, ssl_canalblog) ex.submit(download, i, int(args.parallel), url, logger, args.parser, args.directory, args.html, args.img, ssl_canalblog, args.revert, args.tmp)
for i in range(0, int(args.parallel)) for i in range(0, int(args.parallel))
] ]
except Exception as err: except Exception as err:

View File

@@ -1,12 +1,12 @@
#!/usr/bin/python3 #!/usr/bin/python3
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urlparse from urllib.parse import urlparse
import requests, os, argparse, logging import requests, os, argparse, logging, json
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry from requests.packages.urllib3.util.retry import Retry
class WPExport: class WPExport:
def __init__(self, name = "Thread-0", url = "", logger = None, parser = "html.parser", directory = "backup", ssl_canalblog=True): def __init__(self, name = "Thread-0", url = "", logger = None, parser = "html.parser", directory = "backup", ssl_canalblog=True, tmp="/tmp/import_export_canablog"):
self._url = url self._url = url
self._logger = logger self._logger = logger
self._parser = parser self._parser = parser
@@ -22,7 +22,7 @@ class WPExport:
status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2) status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)
self._request.mount('{0}://'.format(self._protocol), HTTPAdapter(max_retries=retries)) self._request.mount('{0}://'.format(self._protocol), HTTPAdapter(max_retries=retries))
self._tmp = tmp
# Destructor # Destructor
def __del__(self): def __del__(self):
@@ -57,13 +57,13 @@ class WPExport:
# Download HTML # Download HTML
def downloadHTML(self, webpage): def downloadHTML(self, first, second):
self._downloadPage(webpage, self._dir) self._downloadPage(webpage[first][second], self._dir)
# Download Image # Download Image
def downloadImg(self, webpage): def downloadImg(self, first, second):
page_src = self._getImg(webpage) page_src = self._getImg(webpage[first][second])
o = urlparse(self._url) o = urlparse(self._url)
self._downloadPage(page_src, "{0}/{1}/{2}".format(self._dir, o.path, "img")) self._downloadPage(page_src, "{0}/{1}/{2}".format(self._dir, o.path, "img"))
@@ -161,7 +161,14 @@ class WPExport:
except Exception as err: except Exception as err:
self._logger.error("{0} : Exception error : {1}".format(self._name, err)) self._logger.error("{0} : Exception error : {1}".format(self._name, err))
exit(1) exit(1)
return webpage try:
string_webpage = json.dumps(webpage)
open("{0}/{1}.json".format(self._tmp, self._name), "wt").write(string_webpage)
except Exception as ex:
self._logger.error("{0} : Error for writing webpage : {1}".format(self._name, ex))

View File

@@ -8,7 +8,7 @@ from requests.packages.urllib3.util.retry import Retry
class WPimport: class WPimport:
# Constructor # Constructor
def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False): def __init__(self, name="Thread-0", basic=None, wordpress="", logger=None, parser="html.parser", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp="/tmp/import_export_canablog"):
self._name = name self._name = name
self._basic = basic self._basic = basic
self._wordpress = wordpress self._wordpress = wordpress
@@ -16,6 +16,7 @@ class WPimport:
self._parser = parser self._parser = parser
self._headers_json = {'Content-Type': 'application/json; charset=utf-8', 'Accept':'application/json'} self._headers_json = {'Content-Type': 'application/json; charset=utf-8', 'Accept':'application/json'}
self._protocol = "https" self._protocol = "https"
self._directory = "backup"
if ssl_wordpress is False: if ssl_wordpress is False:
self._protocol = "http" self._protocol = "http"
self._request = requests.Session() self._request = requests.Session()
@@ -27,6 +28,7 @@ class WPimport:
self._no_create = no_create self._no_create = no_create
self._no_update = no_update self._no_update = no_update
self._no_image = no_image self._no_image = no_image
self._tmp = tmp
# Destructor # Destructor
def __del__(self): def __del__(self):
@@ -37,7 +39,12 @@ class WPimport:
def setUrl(self, wordpress): def setUrl(self, wordpress):
self._wordpress = wordpress self._wordpress = wordpress
def fromUrl(self, webpage): def fromUrl(self, first, second):
try:
with open("{0}/{1}.json".format(self._tmp, self._name)) as file:
webpage_content = json.loads(file.read())
self._logger.debug("{0} : size of webpage : {1}".format(self._name, len(webpage_content)))
webpage = webpage_content[first][second]
for i in range(0, len(webpage)): for i in range(0, len(webpage)):
try: try:
r = self._request.get(webpage[i]) r = self._request.get(webpage[i])
@@ -49,26 +56,39 @@ class WPimport:
self._addOrUpdatePost(soup) self._addOrUpdatePost(soup)
else: else:
self._addOrUpdateFeaturedMedia(soup) self._addOrUpdateFeaturedMedia(soup)
del webpage_content[first][second][i]
webpage_content = json.dumps(webpage_content)
open("{0}/{1}.json".format(self._tmp, self._name), "wt").write(webpage_content)
else: else:
self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code)) self._logger.error("{0} : Connection error for get url {1} with status code : {2}".format(self._name, webpage[i], r.status_code))
self._logger.debug("{0} : {1}".format(self._name, r.content)) self._logger.debug("{0} : {1}".format(self._name, r.content))
except ConnectionError as err: except ConnectionError as err:
self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err)) self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, webpage[i], err))
exit(1) exit(1)
except IOError as err:
self._logger.error("{0} : Connection error for IO url {1} : {2}".format(self._name, webpage[i], err))
exit(1)
except Exception as err: except Exception as err:
self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, webpage[i], err)) self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, webpage[i], err))
except Exception as ex:
self._logger.error("{0} : Read file json from tmp : {1}".format(self._name, ex))
def fromDirectory(self, directory="", number_thread=1, max_thread=1): def fromDirectory(self, directory="", number_thread=1, max_thread=1):
self._directory = directory
directory = "{0}/archives".format(directory) directory = "{0}/archives".format(directory)
directories = self._getDirectories([], "{0}".format(directory)) directories = self._getDirectories([], "{0}".format(directory))
if len(directories) > 0: if len(directories) > 0:
files = self._getFiles(directories) files = self._getFiles(directories)
self.fromFile(files, number_thread, max_thread) self.fromFile(files=files, number_thread=number_thread, max_thread=max_thread)
else: else:
self._logger.error("{0} : No files for {1}".format(self._name, directory)) self._logger.error("{0} : No files for {1}".format(self._name, directory))
def fromFile(self, files=[], number_thread=1, max_thread=1): def fromFile(self, files=[], number_thread=1, max_thread=1):
divFiles = int(len(files) / max_thread) divFiles = int(len(files) / max_thread)
currentRangeFiles = int(divFiles * (number_thread+1)) currentRangeFiles = int(divFiles * (number_thread+1))
@@ -445,23 +465,38 @@ class WPimport:
if len(img) > 0: if len(img) > 0:
href_a = i.get("href") href_a = i.get("href")
href_img = img[0].get("src") href_img = img[0].get("src")
href_a_o = urlparse(href_a)
href_img_o = urlparse(href_img)
new_img["old_src"]=href_img new_img["old_src"]=href_img
new_img["old_href"]=href_a new_img["old_href"]=href_a
try: try:
if len(href_img_o.netloc) > 0:
img_ok = False
page_img = self._request.get(href_img) page_img = self._request.get(href_img)
if page_img.status_code == 404: if page_img.status_code == 404:
href_img = href_a href_img = href_a
try: try:
page_img = self._request.get(href_a) page_img = self._request.get(href_a)
if page_img.status_code == 200:
img_ok = True
except ConnectionError as err: except ConnectionError as err:
self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err)) self._logger.error("{0} : Connection error for get image : {1}".format(self._name, err))
exit(1) exit(1)
except Exception as err: except Exception as err:
self._logger.error("{0} : Exception error for get image : {1}".format(self._name, err)) self._logger.error("{0} : Exception error for get image : {1}".format(self._name, err))
exit(1) exit(1)
else:
if os.path.exists("{0}/..{1}".format(self._directory, href_img)):
page_img = open("{0}/..{1}".format(self._directory, href_img), "r")
img_ok = True
else:
if os.path.exists("{0}/..{1}".format(self._directory, href_a)):
page_img = open("{0}/..{1}".format(self._directory, href_a), "r")
img_ok = True
self._logger.debug("{0} : Status code for image {1} : {2}".format(self._name, href_img, page_img.status_code)) self._logger.debug("{0} : Status code for image {1} : {2}".format(self._name, href_img, page_img.status_code))
if page_img.status_code == 200: if img_ok is True:
media=self._addOrUpdateMedia(href_img, page_img) media=self._addOrUpdateMedia(href_img, page_img)
new_img["id"]=media["id"] new_img["id"]=media["id"]
new_img["new_src"]=media["rendered"] new_img["new_src"]=media["rendered"]