2023-04-08 12:27:30 +02:00
#!/usr/bin/python3
from bs4 import BeautifulSoup
from urllib . parse import urlparse
import requests , os , logging , re , json
2023-04-13 21:54:35 +02:00
from requests . adapters import HTTPAdapter
from requests . packages . urllib3 . util . retry import Retry
2023-04-08 12:27:30 +02:00
class WPimport :
# Constructor
2023-06-26 22:44:42 +02:00
def __init__(self, name=" Thread-0 ", basic=None, wordpress=" ", logger=None, parser=" html.parser ", ssl_wordpress=True, no_create=False, no_update=False, no_image=False, tmp=" /tmp/import_export_canablog "):
    """Store the import settings and prepare an HTTP session with retries.

    Args:
        name: Label of this worker; used as log prefix and as the base name
            of its JSON state file inside ``tmp``.
        basic: Credentials object handed to every request as ``auth=``.
        wordpress: Host part inserted into the WordPress REST URLs.
        logger: Object exposing ``debug``/``info``/``error``.
        parser: Parser name given to ``BeautifulSoup``.
        ssl_wordpress: When exactly ``False`` the plain-HTTP scheme is used,
            otherwise HTTPS.
        no_create: Stored as ``_no_create``.
        no_update: Stored as ``_no_update``.
        no_image: Stored as ``_no_image``.
        tmp: Directory that holds the per-worker JSON state file.
    """
    # Identity and collaborators.
    self._name = name
    self._basic = basic
    self._wordpress = wordpress
    self._logger = logger
    self._parser = parser
    self._headers_json = {' Content-Type ': ' application/json; charset=utf-8 ', ' Accept ': ' application/json '}
    self._directory = " backup "

    # Scheme selection: only an explicit False switches SSL off.
    self._protocol = " http " if ssl_wordpress is False else " https "

    # Session whose adapter retries connection/read failures and the listed
    # HTTP status codes with an exponential backoff.
    retry_policy = Retry(connect=10, read=10, redirect=5,
                         status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)
    session = requests.Session()
    session.mount(' {0} :// '.format(self._protocol), HTTPAdapter(max_retries=retry_policy))
    self._request = session

    # Behaviour switches and scratch directory.
    self._no_create = no_create
    self._no_update = no_update
    self._no_image = no_image
    self._tmp = tmp
2023-05-01 21:58:47 +02:00
# Destructor
def __del__ ( self ) :
2023-05-02 16:59:31 +02:00
print ( " {0} : Import finished for {1} " . format ( self . _name , self . _wordpress ) )
2023-05-01 21:58:47 +02:00
2023-04-08 12:27:30 +02:00
# Public method
2023-04-10 15:41:14 +02:00
def setUrl ( self , wordpress ) :
self . _wordpress = wordpress
2023-06-20 00:17:38 +02:00
def fromUrl ( self , first , second ) :
2023-06-20 21:38:39 +02:00
try :
2023-06-26 23:52:03 +02:00
with open ( " {0} / {1} .json " . format ( self . _tmp , self . _name ) ) as file :
webpage_content = json . loads ( file . read ( ) )
self . _logger . debug ( " {0} : size of webpage : {1} " . format ( self . _name , len ( webpage_content ) ) )
webpage = webpage_content [ first ] [ second ]
for i in range ( 0 , len ( webpage ) ) :
try :
r = self . _request . get ( webpage [ i ] )
if r . status_code == 200 :
self . _logger . info ( " {0} : ( {1} / {2} ) : Page is importing : {3} " . format ( self . _name , i + 1 , len ( webpage ) , webpage [ i ] ) )
soup = BeautifulSoup ( r . content , self . _parser )
articlebody = soup . find_all ( " div " , class_ = " articlebody " )
if len ( articlebody ) > 0 :
self . _addOrUpdatePost ( soup )
else :
2023-08-05 12:03:17 +02:00
albumbody = soup . find_all ( " div " , class_ = " albumbody " )
if len ( albumbody ) > 0 :
self . _addOrUpdateAlbum ( soup )
else :
self . _addOrUpdateFeaturedMedia ( soup )
2023-06-26 23:52:03 +02:00
del webpage_content [ first ] [ second ] [ i ]
webpage_content = json . dumps ( webpage_content )
open ( " {0} / {1} .json " . format ( self . _tmp , self . _name ) , " wt " ) . write ( webpage_content )
2023-06-20 21:38:39 +02:00
else :
2023-06-26 23:52:03 +02:00
self . _logger . error ( " {0} : Connection error for get url {1} with status code : {2} " . format ( self . _name , webpage [ i ] , r . status_code ) )
self . _logger . debug ( " {0} : {1} " . format ( self . _name , r . content ) )
except ConnectionError as err :
self . _logger . error ( " {0} : Connection error for get url {1} : {2} " . format ( self . _name , webpage [ i ] , err ) )
exit ( 1 )
except IOError as err :
self . _logger . error ( " {0} : Connection error for IO url {1} : {2} " . format ( self . _name , webpage [ i ] , err ) )
exit ( 1 )
except Exception as err :
self . _logger . error ( " {0} : Exception error for get url {1} : {2} " . format ( self . _name , webpage [ i ] , err ) )
2023-06-20 21:38:39 +02:00
except Exception as ex :
self . _logger . error ( " {0} : Read file json from tmp : {1} " . format ( self . _name , ex ) )
2023-04-10 00:00:01 +02:00
2023-06-27 14:37:45 +02:00
def fromDirectory ( self , directory = " " , number_thread = 1 , max_thread = 1 , revert = False ) :
2023-06-26 22:44:42 +02:00
self . _directory = directory
directory = " {0} /archives " . format ( directory )
directories = self . _getDirectories ( [ ] , " {0} " . format ( directory ) )
if len ( directories ) > 0 :
files = self . _getFiles ( directories )
2023-06-28 23:03:27 +02:00
if revert is False :
2023-06-27 14:37:45 +02:00
self . _tmpFiles ( files = files , number_thread = number_thread , max_thread = max_thread )
2023-06-27 14:48:48 +02:00
self . _fromFileTmp ( )
2023-04-29 22:26:47 +02:00
else :
2023-06-26 22:44:42 +02:00
self . _logger . error ( " {0} : No files for {1} " . format ( self . _name , directory ) )
2023-06-25 21:16:05 +02:00
2023-06-27 14:37:45 +02:00
def fromFile ( self , files = [ ] ) :
2023-06-27 14:48:48 +02:00
for i in range ( 0 , len ( files ) ) :
if os . path . exists ( files [ i ] ) :
2023-06-28 23:03:27 +02:00
self . _logger . info ( " {0} : ( {1} / {2} ) File is being processed : {3} " . format ( self . _name , i + 1 , len ( files ) , files [ i ] ) )
2023-06-27 14:48:48 +02:00
with open ( files [ i ] , ' r ' ) as f :
content = f . read ( )
self . _logger . debug ( " {0} : Size of article : {1} " . format ( self . _name , len ( content ) ) )
soup = BeautifulSoup ( content , self . _parser )
articlebody = soup . find_all ( " div " , class_ = " articlebody " )
self . _logger . debug ( " {0} : Number of article : {1} " . format ( self . _name , len ( articlebody ) ) )
if len ( articlebody ) > 0 :
self . _addOrUpdatePost ( soup )
else :
2023-08-05 12:03:17 +02:00
albumbody = soup . find_all ( " div " , class_ = " albumbody " )
if len ( albumbody ) > 0 :
self . _addOrUpdateAlbum ( soup )
else :
self . _addOrUpdateFeaturedMedia ( soup )
2023-06-27 14:48:48 +02:00
# Private method
2023-08-05 12:03:17 +02:00
def _addOrUpdateAlbum ( self , soup ) :
self . _logger . info ( " {0} : Add/Update Album " . format ( self . _name ) )
2023-06-27 14:48:48 +02:00
def _fromFileTmp ( self ) :
2023-06-27 14:37:45 +02:00
try :
with open ( " {0} / {1} .json " . format ( self . _tmp , self . _name ) ) as file :
files = json . loads ( file . read ( ) )
2023-06-28 23:03:27 +02:00
self . _logger . debug ( " {0} : size of webpage : {1} " . format ( self . _name , len ( files ) ) )
2023-06-27 14:37:45 +02:00
for i in range ( 0 , len ( files ) ) :
if os . path . exists ( files [ i ] ) :
2023-06-28 23:03:27 +02:00
self . _logger . info ( " {0} : ( {1} / {2} ) File is being processed : {3} " . format ( self . _name , i + 1 , len ( files ) , files [ i ] ) )
2023-06-27 14:37:45 +02:00
with open ( files [ i ] , ' r ' ) as f :
content = f . read ( )
self . _logger . debug ( " {0} : Size of article : {1} " . format ( self . _name , len ( content ) ) )
soup = BeautifulSoup ( content , self . _parser )
articlebody = soup . find_all ( " div " , class_ = " articlebody " )
self . _logger . debug ( " {0} : Number of article : {1} " . format ( self . _name , len ( articlebody ) ) )
if len ( articlebody ) > 0 :
self . _addOrUpdatePost ( soup )
else :
self . _addOrUpdateFeaturedMedia ( soup )
except Exception as ex :
self . _logger . error ( " {0} : Read file json from tmp : {1} " . format ( self . _name , ex ) )
2023-06-25 21:16:05 +02:00
2023-04-08 12:27:30 +02:00
2023-06-28 23:03:27 +02:00
def _tmpFiles ( self , files = [ ] , number_thread = 1 , max_thread = 1 ) :
2023-05-01 21:44:33 +02:00
divFiles = int ( len ( files ) / max_thread )
currentRangeFiles = int ( divFiles * ( number_thread + 1 ) )
firstRange = int ( currentRangeFiles - divFiles )
self . _logger . debug ( " {0} : index : {1} " . format ( self . _name , number_thread ) )
2023-04-29 22:26:47 +02:00
2023-05-01 21:44:33 +02:00
self . _logger . debug ( " {0} : first range : {1} " . format ( self . _name , firstRange ) )
self . _logger . debug ( " {0} : last range : {1} " . format ( self . _name , currentRangeFiles ) )
2023-06-27 14:37:45 +02:00
webpage = [ ]
2023-05-01 21:44:33 +02:00
for i in range ( firstRange , currentRangeFiles ) :
2023-06-27 14:37:45 +02:00
webpage . append ( files [ i ] )
try :
string_webpage = json . dumps ( webpage )
open ( " {0} / {1} .json " . format ( self . _tmp , self . _name ) , " wt " ) . write ( string_webpage )
except Exception as ex :
self . _logger . error ( " {0} : Error for writing webpage : {1} " . format ( self . _name , ex ) )
2023-04-08 12:27:30 +02:00
2023-05-23 13:45:59 +02:00
## Replace special characters
def _replaceCaracter ( self , title_rendered ) :
list_replace = { ' ’ ' : " ' " , ' – ' : ' - ' , ' … ' : ' ... ' , ' « ' : ' " ' , ' » ' : ' " ' , ' « ' : ' " ' , ' » ' : ' " ' , ' ’ ' : " ' " , ' " ‘ ' : " ' " }
for old , new in list_replace . items ( ) :
title_rendered = title_rendered . replace ( old , new )
return title_rendered
## remove space
def _removeSpace ( self , title ) :
if title [ len ( title ) - 1 ] == " " :
title = title [ : - 1 ]
if title [ 0 ] == " " :
title = title [ 1 : ]
return title
2023-04-08 12:27:30 +02:00
## Get all files
def _getFiles ( self , item ) :
files = [ ]
for i in item :
for j in os . listdir ( i ) :
if os . path . isfile ( " {0} / {1} " . format ( i , j ) ) :
files . append ( " {0} / {1} " . format ( i , j ) )
return files
## Get directories
def _getDirectories ( self , subdirectory , item ) :
sub = subdirectory
for i in os . listdir ( item ) :
if os . path . isdir ( " {0} / {1} " . format ( item , i ) ) :
sub . append ( " {0} / {1} " . format ( item , i ) )
subdirectory = self . _getDirectories ( sub , " {0} / {1} " . format ( item , i ) )
return subdirectory
## Add or update featured media
def _addOrUpdateFeaturedMedia ( self , soup ) :
item_div = soup . find_all ( " div " , { " data-edittype " : " post " } )
for i in item_div :
h2 = i . find_all ( " h2 " ) [ 0 ] . text
params = { " search " : h2 , " type " : " post " }
2023-04-13 22:14:30 +02:00
try :
2023-05-25 00:31:34 +02:00
page = self . _request . get ( " {1} :// {0} /wp-json/wp/v2/search " . format ( self . _wordpress , self . _protocol ) , auth = self . _basic , params = params )
2023-05-28 22:07:43 +02:00
if page . status_code == 200 :
result = page . json ( )
if len ( result ) > 0 :
if h2 == result [ 0 ] [ " title " ] :
img = i . find_all ( " img " )
if len ( img ) > 0 :
img_src = img [ 0 ] . get ( " src " )
2023-04-13 22:14:30 +02:00
try :
2023-05-28 22:07:43 +02:00
page = self . _request . get ( img_src )
if page . status_code == 200 :
name_img = img_src . replace ( " _q " , " " )
name_img = name_img . split ( " / " ) [ len ( name_img . split ( " / " ) ) - 1 ]
params = { " search " : name_img }
2023-04-13 22:14:30 +02:00
try :
2023-05-28 22:07:43 +02:00
page = self . _request . get ( " {1} :// {0} /wp-json/wp/v2/media " . format ( self . _wordpress , self . _protocol ) , auth = self . _basic , params = params )
if page . status_code == 200 :
res = page . json ( )
if len ( res ) > 0 :
id_media = res [ 0 ] [ " id " ]
data = { " featured_media " : id_media }
try :
r = self . _request . post ( " {2} :// {0} /wp-json/wp/v2/posts/ {1} " . format ( self . _wordpress , result [ 0 ] [ " id " ] , self . _protocol ) , auth = self . _basic , headers = self . _headers_json , data = json . dumps ( data ) )
if r . status_code == 200 :
self . _logger . info ( " {0} : Add media featured : {1} " . format ( self . _name , r . json ( ) [ " title " ] [ " raw " ] ) )
else :
self . _logger . error ( " {0} : Connection error with status code for featured media : {1} " . format ( self . _name , r . status_code ) )
self . _logger . debug ( " {0} : {1} " . format ( self . _name , r . content ) )
except ConnectionError as err :
self . _logger . error ( " {0} : Connection error for post media featured : {1} " . format ( self . _name , err ) )
exit ( 1 )
except Exception as err :
self . _logger . error ( " {0} : Exception error for post media featured : {1} " . format ( self . _name , err ) )
else :
self . _logger . info ( " {0} : No media found for {1} " . format ( self . _name , h2 ) )
else :
self . _logger . error ( " {0} : Connection error with status code for search featured media: {1} " . format ( self . _name , page . status_code ) )
self . _logger . debug ( " {0} : {1} " . format ( self . _name , page . content ) )
except ConnectionError as err :
self . _logger . error ( " {0} : Connection error search featured media : {1} " . format ( self . _name , err ) )
2023-04-13 22:14:30 +02:00
exit ( 1 )
2023-05-28 22:07:43 +02:00
except Exception as err :
self . _logger . error ( " {0} : Exception error search featured media : {1} " . format ( self . _name , err ) )
2023-04-08 12:27:30 +02:00
else :
2023-05-28 22:07:43 +02:00
self . _logger . error ( " {0} : Connection error for get featured media with status code : {1} " . format ( self . _name , page . status_code ) )
self . _logger . debug ( " {0} : {1} " . format ( self . _name , page . content ) )
except ConnectionError as err :
self . _logger . error ( " {0} : Connection error for get featured media : {1} " . format ( self . _name , err ) )
exit ( 1 )
except Exception as err :
self . _logger . error ( " {0} : Exception error for get featured media : {1} " . format ( self . _name , err ) )
else :
self . _logger . error ( " {0} : Connection error with status code for featured media : {1} " . format ( self . _name , page . status_code ) )
self . _logger . debug ( " {0} : {1} " . format ( self . _name , page . content ) )
except ConnectionError as err :
self . _logger . error ( " {0} : Connection error : {1} " . format ( self . _name , err ) )
exit ( 1 )
except Exception as err :
self . _logger . error ( " {0} : Connection error : {1} " . format ( self . _name , err ) )
2023-04-17 23:44:09 +02:00
2023-04-08 12:27:30 +02:00
## Association image to post
def _linkImgPost ( self , title , list_img , post_id ) :
for i in list_img :
data = { " post " : post_id }
2023-04-13 22:14:30 +02:00
try :
2023-05-25 00:31:34 +02:00
r = self . _request . post ( " {2} :// {0} /wp-json/wp/v2/media/ {1} " . format ( self . _wordpress , i [ " id " ] , self . _protocol ) , auth = self . _basic , data = data )
2023-05-28 22:31:46 +02:00
if r . status_code == 200 :
self . _logger . info ( " {0} : Link image to post {1} " . format ( self . _name , title ) )
else :
self . _logger . error ( " {0} Connection error with status code for link image to post : {1} " . format ( self . _name , r . status_code ) )
self . _logger . debug ( " {0} : {1} " . format ( self . _name , r . content ) )
except ConnectionError as err :
2023-04-28 23:14:57 +02:00
self . _logger . error ( " {0} : Connection error for link image to post : {1} " . format ( self . _name , err ) )
2023-04-13 22:14:30 +02:00
exit ( 1 )
2023-05-28 22:31:46 +02:00
except Exception as err :
self . _logger . error ( " {0} : Exception error for link image to post : {1} " . format ( self . _name , err ) )
2023-04-20 20:48:37 +02:00
2023-04-08 12:27:30 +02:00
## Add or update img
def _addOrUpdateMedia ( self , href_img , page ) :
2023-05-23 16:46:07 +02:00
media_authorized = [ " png " , " jpg " , " jpeg " , " svg " , " gif " ]
2023-04-08 12:27:30 +02:00
media = { " id " : " " , " rendered " : " " }
split_fileimg = href_img . split ( " / " )
img_name = split_fileimg [ len ( split_fileimg ) - 1 ]
2023-05-02 16:59:31 +02:00
img_type_file = img_name . split ( " . " ) [ len ( img_name . split ( " . " ) ) - 1 ]
is_img = True
if img_type_file not in media_authorized :
self . _logger . error ( " {0} : Element {1} is not image " . format ( self . _name , img_name ) )
is_img = False
if is_img is True :
2023-05-25 00:31:34 +02:00
self . _logger . debug ( " {0} : Search for image {1} with URL {2} " . format ( self . _name , img_name , " {1} :// {0} /wp-json/wp/v2/media " . format ( self . _wordpress , self . _protocol ) ) )
2023-05-02 16:59:31 +02:00
params = { " search " : img_name }
try :
2023-05-25 00:31:34 +02:00
r = self . _request . get ( " {1} :// {0} /wp-json/wp/v2/media " . format ( self . _wordpress , self . _protocol ) , auth = self . _basic , params = params )
2023-05-28 22:31:46 +02:00
self . _logger . debug ( " {0} : Search for image {1} and his status code {2} " . format ( self . _name , img_name , r . status_code ) )
if r . status_code == 200 :
res = r . json ( )
self . _logger . debug ( " {0} : Number of image in search : {1} " . format ( self . _name , len ( res ) ) )
if len ( res ) > 0 :
params = { " force " : 1 }
try :
r = self . _request . delete ( " {2} :// {0} /wp-json/wp/v2/media/ {1} " . format ( self . _wordpress , res [ 0 ] [ " id " ] , self . _protocol ) , auth = self . _basic , params = params )
if r . status_code == 200 :
self . _logger . info ( " {0} : Image removed {1} " . format ( self . _name , img_name ) )
else :
self . _logger . error ( " {0} : Image {1} not removed due status code : {2} " . format ( self . _name , img_name , r . status_code ) )
self . _logger . debug ( " {0} : {1} " . format ( self . _name , r . content ) )
except ConnectionError as err :
self . _logger . error ( " {0} Connection error for delete image : {1} " . format ( self . _name , err ) )
exit ( 1 )
except Exception as err :
self . _logger . error ( " {0} Exception error for delete image : {1} " . format ( self . _name , err ) )
data = page . content
img_type = " image/ {0} " . format ( img_type_file )
if img_type_file == " jpg " :
img_type = " image/jpeg "
headers = { ' Content-Type ' : img_type , ' Content-Disposition ' : ' attachment; filename= {0} ' . format ( img_name ) }
2023-05-02 16:59:31 +02:00
try :
2023-05-28 22:31:46 +02:00
r = self . _request . post ( " {1} :// {0} /wp-json/wp/v2/media " . format ( self . _wordpress , self . _protocol ) , auth = self . _basic , headers = headers , data = data )
if r . status_code == 201 :
self . _logger . info ( " {0} : Image added {1} " . format ( self . _name , img_name ) )
res = r . json ( )
media [ " id " ] = res [ " id " ]
media [ " rendered " ] = res [ " guid " ] [ " rendered " ]
else :
self . _logger . error ( " {0} : Image {1} . {2} not added due status code : {3} " . format ( self . _name , img_name , img_type , r . status_code ) )
self . _logger . debug ( " {0} : {1} " . format ( self . _name , r . content ) )
except ConnectionError as err :
self . _logger . error ( " {0} : Connection error for add image : {1} " . format ( self . _name , err ) )
exit ( 1 )
2023-05-02 16:59:31 +02:00
except Exception as err :
2023-05-28 22:31:46 +02:00
self . _logger . error ( " {0} : Exception error for add image : {1} " . format ( self . _name , err ) )
2023-05-02 16:59:31 +02:00
exit ( 1 )
2023-05-28 22:31:46 +02:00
2023-04-17 23:44:09 +02:00
else :
2023-05-28 22:31:46 +02:00
self . _logger . error ( " {0} : Connection error for search image with status code : {1} " . format ( self . _name , r . status_code ) )
2023-04-28 23:14:57 +02:00
self . _logger . debug ( " {0} : {1} " . format ( self . _name , r . content ) )
2023-05-28 22:31:46 +02:00
except ConnectionError as err :
self . _logger . error ( " {0} : Connection error for search media : {1} " . format ( self . _name , err ) )
exit ( 1 )
except Exception as err :
self . _logger . error ( " {0} : Exception error for search media : {1} " . format ( self . _name , err ) )
2023-04-08 12:27:30 +02:00
return media
2023-04-11 22:30:00 +02:00
## Add or update comment
2023-04-11 23:26:40 +02:00
def _addOrUpdateComment ( self , post , comment , title ) :
2023-04-20 00:08:56 +02:00
for i in comment :
try :
params = { " post " : post , " author_name " : i [ " author " ] , " date " : i [ " date " ] }
2023-05-25 00:31:34 +02:00
page = self . _request . get ( " {1} :// {0} /wp-json/wp/v2/comments " . format ( self . _wordpress , self . _protocol ) , auth = self . _basic , params = params )
2023-05-28 22:31:46 +02:00
if page . status_code == 200 :
result = page . json ( )
for j in result :
try :
params = { " force " : 1 }
page = self . _request . delete ( " {2} :// {0} /wp-json/wp/v2/comments/ {1} " . format ( self . _wordpress , j [ " id " ] , self . _protocol ) , params = params , auth = self . _basic )
if page . status_code == 200 :
self . _logger . info ( " {0} : Comment deleted for {1} " . format ( self . _name , title ) )
self . _logger . debug ( " {0} : Comment deleted : {1} " . format ( self . _name , j ) )
else :
self . _logger . error ( " {0} : Comment not deleted for {1} due status code : {2} " . format ( self . _name , title , page . status_code ) )
self . _logger . debug ( " {0} : {1} " . format ( self . _name , page . content ) )
except ConnectionError as err :
self . _logger . error ( " {0} : Connection error for delete comment : {1} " . format ( self . _name , err ) )
exit ( 1 )
except Exception as err :
self . _logger . error ( " {0} : Exception error for delete comment : {1} " . format ( self . _name , err ) )
else :
self . _logger . error ( " {0} : Comment not listed for {1} due status code : {2} " . format ( self . _name , title , page . status_code ) )
self . _logger . debug ( " {0} : {1} " . format ( self . _name , page . content ) )
except ConnectionError as err :
2023-04-28 23:14:57 +02:00
self . _logger . error ( " {0} : Connection error for search comment : {1} " . format ( self . _name , err ) )
2023-04-20 00:08:56 +02:00
exit ( 1 )
2023-05-28 22:31:46 +02:00
except Exception as err :
self . _logger . error ( " {0} : Exception error for search comment : {1} " . format ( self . _name , err ) )
2023-04-20 20:48:37 +02:00
2023-04-20 00:08:56 +02:00
for i in comment :
data = { " post " : post , " content " : i [ " content " ] , " date " : i [ " date " ] , " author_name " : i [ " author " ] , " status " : " approved " }
if i [ " parent_id " ] != - 1 :
parent_id = int ( i [ " parent_id " ] )
params = { " post " : post , " author_name " : comment [ parent_id ] [ " author " ] , " date " : comment [ parent_id ] [ " date " ] }
2023-04-13 22:14:30 +02:00
try :
2023-05-25 00:31:34 +02:00
page = self . _request . get ( " {1} :// {0} /wp-json/wp/v2/comments " . format ( self . _wordpress , self . _protocol ) , auth = self . _basic , params = params )
2023-05-28 22:31:46 +02:00
if page . status_code == 200 :
result = page . json ( )
if len ( result ) > 0 :
data [ " parent " ] = result [ 0 ] [ " id " ]
else :
self . _logger . error ( " {0} : Connection error for parent comment with status code : {1} " . format ( self . _name , page . status_code ) )
self . _logger . debug ( " {0} : {1} " . format ( self . _name , page . content ) )
except ConnectionError as err :
2023-04-28 23:14:57 +02:00
self . _logger . error ( " {0} : Connection error for parent comment : {1} " . format ( self . _name , err ) )
2023-04-13 22:14:30 +02:00
exit ( 1 )
2023-05-28 22:31:46 +02:00
except Exception as err :
self . _logger . error ( " {0} : Exception error for parent comment : {1} " . format ( self . _name , err ) )
2023-04-20 20:48:37 +02:00
2023-04-19 22:21:15 +02:00
try :
2023-05-25 00:31:34 +02:00
page = self . _request . post ( " {1} :// {0} /wp-json/wp/v2/comments " . format ( self . _wordpress , self . _protocol ) , auth = self . _basic , data = data )
2023-05-28 22:31:46 +02:00
if page . status_code == 201 :
self . _logger . info ( " {0} : Comment added for {1} " . format ( self . _name , title ) )
self . _logger . debug ( " {0} : Data : {1} " . format ( self . _name , data ) )
else :
self . _logger . error ( " {0} : Comment not added for {1} due status code : {2} " . format ( self . _name , title , page . status_code ) )
self . _logger . debug ( " {0} : {1} " . format ( self . _name , page . content ) )
except ConnectionError as err :
2023-04-28 23:14:57 +02:00
self . _logger . error ( " {0} : Connection error for add comment : {1} " . format ( self . _name , err ) )
2023-04-19 22:21:15 +02:00
exit ( 1 )
2023-05-28 22:31:46 +02:00
except Exception as err :
self . _logger . error ( " {0} : Exception error for add comment : {1} " . format ( self . _name , err ) )
exit ( 1 )
2023-04-08 12:27:30 +02:00
2023-04-16 19:16:23 +02:00
## Check class name
def _hasClassName ( self , tag , className ) :
for i in tag [ " class " ] :
if i == className :
return True
return False
## Get class name
def _getClassName ( self , tag , className ) :
for i in tag [ " class " ] :
if re . match ( className , i ) :
return i
return " "
2023-04-14 23:10:07 +02:00
2023-04-16 19:16:23 +02:00
## Get all comments
2023-04-14 23:10:07 +02:00
def _getComment ( self , comment ) :
comment_post = [ ]
2023-04-16 19:32:00 +02:00
for i in range ( 0 , len ( comment ) ) :
comment_div = comment [ i ] . find ( " div " , class_ = " comment_item " )
2023-04-14 23:10:07 +02:00
comment_item = comment_div . text . split ( " \n " )
footer = comment_div . find_all ( " div " , class_ = " itemfooter " )
comment_author = footer [ 0 ] . text . split ( " , " ) [ 0 ] . replace ( " Posté par " , " " )
comment_date = footer [ 0 ] . find_all ( " abbr " ) [ 0 ] . get ( " title " )
comment_content = " <p> "
for j in range ( 0 , len ( comment_item ) - 2 ) :
if len ( comment_item [ j ] ) > 0 :
comment_content = comment_content + comment_item [ j ] + " <br /> "
comment_content = comment_content + " </p> "
parent = - 1
2023-04-16 21:06:04 +02:00
if self . _hasClassName ( comment [ i ] , " level-1 " ) is False :
2023-04-16 19:16:23 +02:00
block = False
2023-04-16 19:32:00 +02:00
className = self . _getClassName ( comment [ i ] , " level- " ) . split ( " - " )
2023-04-16 19:16:23 +02:00
level = 1
if len ( className ) > 0 :
level = int ( className [ 1 ] )
2023-04-16 21:06:04 +02:00
for j in range ( i - 1 , 0 , - 1 ) :
2023-04-16 19:16:23 +02:00
if block is False :
levelName = " level- {0} " . format ( level - 1 )
2023-04-16 21:06:04 +02:00
if self . _hasClassName ( comment [ j ] , levelName ) is True :
2023-04-16 19:16:23 +02:00
parent = j
block = True
2023-04-14 23:10:07 +02:00
2023-04-16 21:06:04 +02:00
comment_post . append ( { " author " : comment_author , " date " : comment_date , " content " : comment_content , " parent_id " : parent } )
2023-04-14 23:10:07 +02:00
return comment_post
2023-04-08 12:27:30 +02:00
## Add or Update post
def _addOrUpdatePost(self, soup):
    """Create or update the WordPress post described by a parsed article page.

    Steps, in order: retrieve and upload the article images (unless
    ``self._no_image`` is set), collect the comments, resolve or create the
    tags and categories, build the post payload, look up the author, then
    update the post with the same title (deleting extra duplicates) or
    create a new post.

    NOTE(review): string literals are kept exactly as they appear in this
    file, surrounding spaces included -- confirm them against the markup
    and the REST API.
    NOTE(review): ``ConnectionError`` in the handlers below is the builtin
    exception; ``requests.exceptions.ConnectionError`` does not appear to
    derive from it, so such failures would land in the generic
    ``except Exception`` handlers -- confirm the intent.

    Args:
        soup: parsed article page supporting ``find_all`` (presumably a
            BeautifulSoup document -- confirm against the caller).

    Returns:
        None. The outcome is reported through ``self._logger``.

    Raises:
        IndexError: when the page lacks one of the expected elements (the
            first result of several ``find_all`` calls is used unguarded).
        KeyError: when the month name of the date header is not in the
            month table.
        SystemExit: through ``exit(1)`` on a connection error, and on any
            error while retrieving an image.
    """
    month = {" janvier ": " 01 ", " février ": " 02 ", " mars ": " 03 ", " avril ": " 04 ", " mai ": " 05 ", " juin ": " 06 ", " juillet ": " 07 ", " août ": " 08 ", " septembre ": " 09 ", " octobre ": " 10 ", " novembre ": " 11 ", " décembre ": " 12 "}
    liste = [" categories ", " tags "]
    # element: names found in the page; listelement: matching WordPress ids
    element = {}
    listelement = {}
    for i in liste:
        element[i] = []
        listelement[i] = []

    articletitle = soup.find_all(" h2 ", class_=" articletitle ")
    self._logger.debug(" {0} : Title of the article : {1} ".format(self._name, articletitle))
    articlebody = soup.find_all(" div ", class_=" articlebody ")
    articledate = soup.find_all(" span ", class_=" articledate ")
    articleacreator = soup.find_all(" span ", class_=" articlecreator ")
    dateheader = soup.find_all(" div ", class_=" dateheader ")
    itemfooter = soup.find_all(" div ", class_=" itemfooter ")
    comment = soup.find_all(" li ", class_=" comment ")
    img_a = articlebody[0].find_all(" a ", {" target ": " _blank "})
    self._logger.debug(" {0} : Number of image ' s link : {1} ".format(self._name, len(img_a)))

    # ---- Images: retrieve each linked image and upload it as media ----
    list_img = []
    if self._no_image is False:
        for i in img_a:
            new_img = {}
            img = i.find_all(" img ")
            self._logger.debug(" {0} : Number of image ' s tag : {1} ".format(self._name, len(img)))
            if len(img) > 0:
                href_a = i.get(" href ")
                href_img = img[0].get(" src ")
                href_img_o = urlparse(href_img)
                new_img[" old_src "] = href_img
                new_img[" old_href "] = href_a
                try:
                    if len(href_img_o.netloc) > 0:
                        # Remote image: fetch the <img> source and fall
                        # back to the link target when it answers 404.
                        img_ok = False
                        page_img = self._request.get(href_img)
                        if page_img.status_code == 404:
                            href_img = href_a
                            try:
                                page_img = self._request.get(href_a)
                                if page_img.status_code == 200:
                                    img_ok = True
                            except ConnectionError as err:
                                self._logger.error(" {0} : Connection error for get image : {1} ".format(self._name, err))
                                exit(1)
                            except Exception as err:
                                self._logger.error(" {0} : Exception error for get image : {1} ".format(self._name, err))
                                exit(1)
                        elif page_img.status_code == 200:
                            # A successful first fetch is usable as well,
                            # not only a successful fallback.
                            img_ok = True
                    else:
                        # Source without host: look for the file relative
                        # to the backup directory.
                        # NOTE(review): a file object has no status_code,
                        # so the status handling below looks like it fails
                        # for local images; img_ok/page_img are also left
                        # unset when neither path exists -- confirm.
                        if os.path.exists(" {0} /.. {1} ".format(self._directory, href_img)):
                            page_img = open(" {0} /.. {1} ".format(self._directory, href_img), " r ")
                            img_ok = True
                        else:
                            if os.path.exists(" {0} /.. {1} ".format(self._directory, href_a)):
                                page_img = open(" {0} /.. {1} ".format(self._directory, href_a), " r ")
                                img_ok = True
                    self._logger.debug(" {0} : Status code for image {1} : {2} ".format(self._name, href_img, page_img.status_code))
                    if img_ok is True:
                        media = self._addOrUpdateMedia(href_img, page_img)
                        new_img[" id "] = media[" id "]
                        new_img[" new_src "] = media[" rendered "]
                        list_img.append(new_img)
                        if href_img != href_a:
                            media = self._addOrUpdateMedia(href_a, page_img)
                            new_img[" id "] = media[" id "]
                            new_img[" new_src "] = media[" rendered "]
                            list_img.append(new_img)
                    if page_img.status_code not in [200, 404]:
                        self._logger.error(" {0} : Connection error with status code for get image : {1} ".format(self._name, page_img.status_code))
                        self._logger.debug(" {0} : {1} ".format(self._name, page_img.content))
                except ConnectionError as err:
                    self._logger.error(" {0} : Connection error for get image : {1} ".format(self._name, err))
                    exit(1)
                except Exception as err:
                    self._logger.error(" {0} : Exception error for get image : {1} ".format(self._name, err))
                    exit(1)

    self._logger.debug(" {0} : Number of image : {1} ".format(self._name, len(list_img)))
    comment_post = self._getComment(comment)

    # ---- Tags and categories: read them from the article footer ----
    a = itemfooter[0].find_all(" a ", {" rel ": True})
    for i in a:
        rel = i.get(" rel ")
        if rel[0] == ' tag ':
            href = i.get(" href ")
            if re.search(r' /tag/ ', href):
                element[" tags "].append(i.text)
            if re.search(r' /archives/ ', href):
                element[" categories "].append(i.text)

    # Resolve every name to a WordPress id, creating the term when no
    # search result carries the same name.
    for i in liste:
        for j in element[i]:
            element_exist = False
            title_element = self._removeSpace(j)
            for index in range(1, 10):
                self._logger.info(" {0} : search {1} with index {2} : {3} ".format(self._name, i, index, title_element))
                try:
                    params = {" search ": title_element, " per_page ": " 100 ", " page ": index}
                    page = self._request.get(" {2} :// {0} /wp-json/wp/v2/ {1} ".format(self._wordpress, i, self._protocol), auth=self._basic, params=params)
                    if page.status_code == 200:
                        result = page.json()
                        self._logger.debug(" {0} : content {3} {2} : {1} ".format(self._name, result, title_element, i))
                        if len(result) > 0:
                            for k in result:
                                title_rendered = k[" name "]
                                self._logger.debug(" {0} : content {2} : {1} ".format(self._name, title_rendered, i))
                                self._logger.debug(" {0} : size of content {3} : {2} - {1} ".format(self._name, len(title_rendered), len(title_element), i))
                                if len(title_element) != len(title_rendered):
                                    title_rendered = self._replaceCaracter(title_rendered)
                                if title_element == title_rendered:
                                    self._logger.info(" {0} : {1} found : {2} ".format(self._name, i, title_rendered))
                                    element_exist = True
                                    listelement[i].append(k[" id "])
                        else:
                            # Empty page: no further results to scan
                            break
                    if page.status_code == 400:
                        self._logger.error(" {0} : {1} not found due status code : {2} ".format(self._name, i, page.status_code))
                        self._logger.debug(" {0} : {1} ".format(self._name, page.content))
                        break
                    elif page.status_code != 200:
                        # A successful response is not an error
                        self._logger.error(" {0} : {1} not found due status code : {2} ".format(self._name, i, page.status_code))
                        self._logger.debug(" {0} : {1} ".format(self._name, page.content))
                except ConnectionError as err:
                    self._logger.error(" {0} : Connection error for {1} : {2} ".format(self._name, i, err))
                    exit(1)
                except Exception as err:
                    self._logger.error(" {0} : Exception error for {1} : {2} ".format(self._name, i, err))
            self._logger.debug(" {0} : Element {3} {2} is {1} ".format(self._name, element_exist, title_element, i))
            if element_exist == False:
                data = {" name ": title_element}
                self._logger.info(" {0} : Create {1} : {2} ".format(self._name, i, title_element))
                self._logger.debug(" {0} : Data : {1} ".format(self._name, data))
                try:
                    page = self._request.post(" {2} :// {0} /wp-json/wp/v2/ {1} ".format(self._wordpress, i, self._protocol), auth=self._basic, headers=self._headers_json, data=json.dumps(data))
                    if page.status_code == 201:
                        self._logger.info(" {0} : {1} created : {2} ".format(self._name, i, j))
                        result = page.json()
                        listelement[i].append(result[" id "])
                    else:
                        self._logger.error(" {0} : {1} not added due status code : {2} ".format(self._name, i, page.status_code))
                        self._logger.debug(" {0} : {1} ".format(self._name, page.content))
                except ConnectionError as err:
                    self._logger.error(" {0} : Connection error for post {1} : {2} ".format(self._name, i, err))
                    exit(1)
                except Exception as err:
                    self._logger.error(" {0} : Exception error for post {1} : {2} ".format(self._name, i, err))

    # ---- Post payload ----
    title = articletitle[0].text
    author = articleacreator[0].text.lower()
    body = articlebody[0].find_all(" p ")
    bodyhtml = " <p> "
    for i in body:
        if len(i.text) == 1:
            bodyhtml = bodyhtml + " <br /> "
        else:
            bodyhtml = bodyhtml + str(i).replace(" <p> ", " ").replace(" </p> ", " ").replace(" <br> ", " <br /> ") + " <br /> "
    bodyhtml = bodyhtml + " </p> "
    # Point the image references at the path of the uploaded media
    for i in list_img:
        o = urlparse(i[" new_src "])
        bodyhtml = bodyhtml.replace(i[" old_href "], o.path)
        bodyhtml = bodyhtml.replace(i[" old_src "], o.path)
    hour = articledate[0].text
    time = dateheader[0].text.split(" ")
    self._logger.debug(" {0} : Title post : | {1} | ".format(self._name, title))
    title = self._removeSpace(title)
    self._logger.debug(" {0} : Rendered Title post : | {1} | ".format(self._name, title))
    data = {" title ": title, " content ": bodyhtml, " status ": " publish ", " date ": " {0} - {1} - {2} T {3} :00 ".format(time[2], month[time[1]], time[0], hour), " tags ": listelement[" tags "], " categories ": listelement[" categories "]}
    self._logger.debug(" {0} : Data for post : | {1} | : {2} ".format(self._name, title, data))

    # ---- Author: the last user returned by the search is kept ----
    params = {" search ": author, " per_page ": 100}
    try:
        self._logger.info(" {0} : Search author : {1} ".format(self._name, author))
        page = self._request.get(" {1} :// {0} /wp-json/wp/v2/users ".format(self._wordpress, self._protocol), auth=self._basic, headers=self._headers_json, params=params)
        self._logger.debug(" {0} : End Search author : {1} ".format(self._name, author))
        self._logger.debug(" {0} : Debug requests : {1} ".format(self._name, page.content))
        if page.status_code == 200:
            # Parse first, so the log shows this response's users
            result = page.json()
            self._logger.info(" {0} : Get author id : {1} ".format(self._name, result))
            for a in result:
                data[" author "] = a[" id "]
        else:
            self._logger.error(" {0} : Connection error with status code for get author : {1} ".format(self._name, page.status_code))
            self._logger.debug(" {0} : {1} ".format(self._name, page.content))
    except ConnectionError as err:
        self._logger.error(" {0} : Connection error for get author : {1} ".format(self._name, err))
        exit(1)
    except Exception as err:
        self._logger.error(" {0} : Exception error for get author : {1} ".format(self._name, err))

    # ---- Existing posts: update the first match, delete further ones ----
    page_is_exist = False
    for index in range(1, 10):
        params = {" search ": title, " per_page ": 100, " page ": index}
        try:
            self._logger.info(" {0} : Search post with index {2} : {1} ".format(self._name, title, index))
            page = self._request.get(" {1} :// {0} /wp-json/wp/v2/posts ".format(self._wordpress, self._protocol), auth=self._basic, params=params, headers=self._headers_json)
            if page.status_code == 200:
                self._logger.debug(" {0} : Encoding : {1} ".format(self._name, page.encoding))
                page.encoding = " utf-8 "
                result = page.json()
                if len(result) == 0:
                    break
                self._logger.info(" {0} : Number result posts : {1} ".format(self._name, len(result)))
                count = 0
                for i in result:
                    title_rendered = i[" title "][" rendered "]
                    self._logger.info(" {0} : Search title posts for | {2} | : | {1} | ".format(self._name, title_rendered, title))
                    if len(title_rendered) != len(title):
                        title_rendered = self._replaceCaracter(title_rendered)
                    self._logger.debug(" {0} : Search title posts for | {2} | : | {1} | ".format(self._name, title_rendered, title))
                    self._logger.debug(" {0} : SIze of title : {1} - {2} ".format(self._name, len(title), len(title_rendered)))
                    if title_rendered == title:
                        if self._no_update is False:
                            page_is_exist = True
                            post_id = i[" id "]
                            count = count + 1
                            if count > 1:
                                # Second or later match on this page
                                self._logger.info(" {0} : Page {1} is double and going to delete ".format(self._name, title))
                                try:
                                    params = {" force ": 1}
                                    page = self._request.delete(" {2} :// {0} /wp-json/wp/v2/posts/ {1} ".format(self._wordpress, post_id, self._protocol), auth=self._basic, headers=self._headers_json, params=params)
                                    if page.status_code == 200:
                                        self._logger.info(" {0} : Post deleted : {1} ".format(self._name, title))
                                    else:
                                        self._logger.error(" {0} : Post not updated due status code : {1} ".format(self._name, page.status_code))
                                        self._logger.debug(" {0} : {1} ".format(self._name, page.content))
                                except ConnectionError as err:
                                    self._logger.error(" {0} : Connection error for deleted post : {1} ".format(self._name, err))
                                    exit(1)
                                except Exception as err:
                                    self._logger.error(" {0} : Exception error for deleted post : {1} ".format(self._name, err))
                            else:
                                self._logger.debug(" {0} : Data for post to update : {1} ".format(self._name, i))
                                self._logger.info(" {0} : Page {1} already exist and going to update ".format(self._name, title))
                                try:
                                    page = self._request.post(" {2} :// {0} /wp-json/wp/v2/posts/ {1} ".format(self._wordpress, post_id, self._protocol), auth=self._basic, headers=self._headers_json, data=json.dumps(data))
                                    if page.status_code == 200:
                                        result = page.json()
                                        self._logger.info(" {0} : Post updated : {1} ".format(self._name, title))
                                        self._addOrUpdateComment(result[" id "], comment_post, result[" title "][" raw "])
                                        self._linkImgPost(result[" title "][" raw "], list_img, result[" id "])
                                    else:
                                        self._logger.error(" {0} : Post not updated due status code : {1} ".format(self._name, page.status_code))
                                        self._logger.debug(" {0} : {1} ".format(self._name, page.content))
                                except ConnectionError as err:
                                    self._logger.error(" {0} : Connection error for update post : {1} ".format(self._name, err))
                                    exit(1)
                                except Exception as err:
                                    self._logger.error(" {0} : Exception error for update post : {1} ".format(self._name, err))
            # ``page`` is the latest response here: the search itself, or
            # the update/delete request issued while scanning its results.
            if page.status_code == 400:
                self._logger.error(" {0} : Connection for update post unauthorized : {1} ".format(self._name, page.status_code))
                self._logger.debug(" {0} : {1} ".format(self._name, page.content))
                break
            elif page.status_code != 200:
                # A successful response is not an error
                self._logger.error(" {0} : Connection for update post error with status code : {1} ".format(self._name, page.status_code))
                self._logger.debug(" {0} : {1} ".format(self._name, page.content))
        except ConnectionError as err:
            self._logger.error(" {0} : Connection error for search post : {1} ".format(self._name, err))
            exit(1)
        except Exception as err:
            self._logger.error(" {0} : Exception error for search post : {1} ".format(self._name, err))

    # ---- Creation, when no post was matched and creation is allowed ----
    if page_is_exist is False and self._no_create is False:
        try:
            self._logger.info(" {0} : Creating posts : {1} ".format(self._name, data[" title "]))
            page = self._request.post(" {1} :// {0} /wp-json/wp/v2/posts ".format(self._wordpress, self._protocol), auth=self._basic, headers=self._headers_json, data=json.dumps(data))
            if page.status_code == 201:
                result = page.json()
                self._logger.info(" {0} : Post added : {1} ".format(self._name, result[" title "][" raw "]))
                self._addOrUpdateComment(result[" id "], comment_post, result[" title "][" raw "])
                self._linkImgPost(result[" title "][" raw "], list_img, result[" id "])
            else:
                self._logger.error(" {0} : Post not added due status code : {1} ".format(self._name, page.status_code))
                self._logger.debug(" {0} : {1} ".format(self._name, page.content))
        except ConnectionError as err:
            self._logger.error(" {0} : Connection error for create post : {1} ".format(self._name, err))
            exit(1)
        except Exception as err:
            self._logger.error(" {0} : Exception error for create post : {1} ".format(self._name, err))