2023-06-30 23:28:54 +02:00
#!/usr/bin/python3
from bs4 import BeautifulSoup
from urllib . parse import urlparse
import requests , os , logging , re , json
from requests . adapters import HTTPAdapter
from requests . packages . urllib3 . util . retry import Retry
class WPMenu:
    """Read the navigation menu of a Canalblog page and match it to WordPress terms.

    The menu is taken from the ``<ul id="listsmooth">`` element of the page.
    Every entry title is then looked up through the WordPress REST endpoints
    ``/wp-json/wp/v2/categories`` and ``/wp-json/wp/v2/tags`` to find the id of
    the term carrying the same name or slug.
    """

    # Highest result page requested per endpoint (pages 1..9 are tried).
    _MAX_PAGES = 9
    # Page size sent as ``per_page`` to the WordPress REST API.
    _PER_PAGE = 100

    # Constructor
    def __init__(self, name="Thread-0", basic=None, canalblog="", wordpress="", logger=None, parser="html.parser", ssl_canalblog=True, ssl_wordpress=True):
        """Store the settings and prepare one retrying HTTP session per site.

        name          -- label put in front of every log line
        basic         -- value passed as ``auth`` to the WordPress requests
        canalblog     -- Canalblog host, used to recognise the "home" link
        wordpress     -- WordPress host used to build the REST URLs
        logger        -- logger object; a module logger is used when None
        parser        -- parser name handed to BeautifulSoup
        ssl_canalblog -- False selects http instead of https for Canalblog
        ssl_wordpress -- False selects http instead of https for WordPress
        """
        self._name = name
        self._basic = basic
        self._canalblog = canalblog
        self._wordpress = wordpress
        # Without a fallback every method would fail on ``None.info(...)``.
        self._logger = logger if logger is not None else logging.getLogger(__name__)
        self._parser = parser
        self._headers_json = {'Content-Type': 'application/json; charset=utf-8', 'Accept': 'application/json'}
        self._protocol_wordpress = "https"
        self._protocol_canalblog = "https"
        self._directory = "backup"

        if ssl_wordpress is False:
            self._protocol_wordpress = "http"
        if ssl_canalblog is False:
            self._protocol_canalblog = "http"

        self._request_canalblog = requests.Session()
        self._request_wordpress = requests.Session()

        retries = Retry(connect=10, read=10, redirect=5,
                        status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)

        # The retry policy is mounted only on the scheme selected for each site.
        self._request_canalblog.mount('{0}://'.format(self._protocol_canalblog), HTTPAdapter(max_retries=retries))
        self._request_wordpress.mount('{0}://'.format(self._protocol_wordpress), HTTPAdapter(max_retries=retries))

    # Destructor
    def __del__(self):
        print("{0} : Import finished for {1}".format(self._name, self._wordpress))

    # Public method

    ## From file
    def fromFile(self, files):
        """Process the menu found in the HTML file at path ``files``.

        An error is logged when the path does not exist.
        """
        if os.path.exists(files):
            with open(files, 'r') as f:
                self._logger.info("{0} : File is being processed : {1}".format(self._name, files))
                content = f.read()
                self._menu(content)
        else:
            self._logger.error("{0} : File isn't exist : {1}".format(self._name, files))

    ## Get from URL
    def fromUrl(self, canalblog):
        """Download ``canalblog`` and process the menu found in the page.

        The scheme of the URL is replaced by the configured Canalblog
        protocol. A connection failure is logged and ends the process with
        status 1; any other error is logged only.
        """
        self._canalblog = canalblog
        try:
            o = urlparse(canalblog)
            o = o._replace(scheme=self._protocol_canalblog)
            # A URL given without scheme ends up with an empty netloc,
            # which geturl() renders as "scheme:///path".
            i = o.geturl().replace(":///", "://")
            page = self._request_canalblog.get(i)
            if page.status_code == 200:
                self._logger.info("{0} : Page web is being processed : {1}".format(self._name, i))
                self._menu(page.content)
            else:
                self._logger.error("{0} : index didn't get due status code : {1}".format(self._name, page.status_code))
                self._logger.debug("{0} : {1}".format(self._name, page.content))
        # requests raises its own ConnectionError class, which is not a
        # subclass of the builtin one, so both have to be named here.
        except (requests.exceptions.ConnectionError, ConnectionError) as err:
            self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, canalblog, err))
            raise SystemExit(1)
        except Exception as err:
            self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, canalblog, err))

    # Private method

    def _getIdFromPost(self, href):
        """Placeholder: always return 0."""
        idMenu = 0
        return idMenu

    def _getIdFromReverse(self, title, href):
        """Look up a term id from the directory part of an ``.../<slug>/index.html`` link.

        ``title`` is only written to the log. Return the id given by
        ``_getId`` for ``<slug>``, or 0 when ``href`` is missing, is "#",
        holds no "/" or does not end with "index.html".
        """
        self._logger.info("{0} : get title {1} from href {2}".format(self._name, title, href))
        if not href or href == "#":
            return 0
        parts = href.split("/")
        if len(parts) < 2:
            # No directory segment in front of the file name.
            return 0
        link = parts[-1]
        slug = parts[-2]
        self._logger.info("{0} link {1} title {2}".format(self._name, link, slug))
        if link == "index.html":
            return self._getId(slug)
        return 0

    def _getId(self, title):
        """Return the id of the category or tag whose name or slug equals ``title``.

        Categories are searched first, then tags. Return 0 when nothing matches.
        """
        for taxonomy in ["categories", "tags"]:
            idMenu = self._searchTerm(taxonomy, title)
            if idMenu != 0:
                return idMenu
        return 0

    def _searchTerm(self, taxonomy, title):
        """Search one REST endpoint (``categories`` or ``tags``) for ``title``.

        Pages are requested one after the other until a match is found, a
        page holds fewer entries than requested, a non-200 status is
        received, or ``_MAX_PAGES`` pages have been read. Return the id of
        the matching term, or 0.
        """
        url = "{2}://{0}/wp-json/wp/v2/{1}".format(self._wordpress, taxonomy, self._protocol_wordpress)
        idMenu = 0
        for index in range(1, self._MAX_PAGES + 1):
            try:
                params = {"search": title, "per_page": str(self._PER_PAGE), "page": index}
                self._logger.info("{0} Get menu {1} {2} {3}".format(self._name, url, index, title))
                page = self._request_wordpress.get(url, auth=self._basic, params=params)
                if page.status_code != 200:
                    self._logger.error("{0} : {2} didn't get due status code : {1}".format(self._name, page.status_code, taxonomy))
                    self._logger.debug("{0} : {1}".format(self._name, page.content))
                    # The following pages of the same endpoint would fail the same way.
                    break
                result = page.json()
                for term in result:
                    self._logger.info("{0} info : {1} {2} {3}".format(self._name, term["name"], term["slug"], title))
                    if term["name"] == title or term["slug"] == title:
                        self._logger.info("{0} : comparaison ok : {1} {2}".format(self._name, term["id"], taxonomy))
                        # No break: when several terms of a page match, the last one is kept.
                        idMenu = term["id"]
                if idMenu != 0 or len(result) < self._PER_PAGE:
                    # Found, or this was a short page so no further page is requested.
                    break
            # See fromUrl: the requests class and the builtin are unrelated.
            except (requests.exceptions.ConnectionError, ConnectionError) as err:
                self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, url, err))
                raise SystemExit(1)
            except Exception as err:
                self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, url, err))
        return idMenu

    def _menu(self, content):
        """Build the menu tree from HTML ``content`` and resolve the WordPress ids.

        Return the list of top-level entries. Each entry is a dict holding
        at least "id", "title" and "children". An empty list is returned,
        and an error logged, when the page has no ``<ul id="listsmooth">``.
        """
        soup = BeautifulSoup(content, self._parser)
        ul = soup.find("ul", id="listsmooth")
        if ul is None:
            self._logger.error("{0} : Menu listsmooth not found".format(self._name))
            return []

        menu, children = self._parseItems(ul)
        menu = self._attachChildren(menu, children)
        self._resolveIds(menu)

        for entry in menu:
            self._logger.info("{0} Menu parent {1} with id : {2}".format(self._name, entry["title"], entry["id"]))
            for child in entry["children"]:
                self._logger.info("{0} Menu children {1} with id : {2}".format(self._name, child["title"], child["id"]))
        return menu

    def _parseItems(self, ul):
        """Collect one entry per ``<li>`` below ``ul`` and the children of "#" entries.

        Return ``(menu, children)``. ``find_all`` walks every nesting level,
        so a nested ``<li>`` appears in both lists; ``_attachChildren``
        removes it from ``menu`` afterwards.
        """
        menu = list()
        children = list()
        home = "{0}://{1}/".format(self._protocol_canalblog, self._canalblog)
        for item in ul.find_all("li"):
            anchor = item.find("a")
            if anchor is None:
                # An <li> without link carries neither a title nor a target.
                continue
            parent = anchor.get_text().replace("\xa0", "")
            href = anchor.get("href")
            if href == home:
                parent = "home"
            menu.append({"id": "", "type": "", "title": parent, "href": href, "children": list()})
            if href != "#":
                continue
            submenu = item.find("ul")
            if submenu is None:
                continue
            for child in submenu.find_all("li"):
                a = child.find("a")
                if a is None:
                    continue
                self._logger.info("{0} Parent {1} : Child {2}".format(self._name, parent, a.get_text()))
                children.append({"title": a.get_text(), "parent": parent, "href": a.get("href")})
        return menu, children

    def _attachChildren(self, menu, children):
        """Drop the entries of ``menu`` that are children and attach them to their parent.

        Entries are matched on their title. Return the new top-level list;
        the dicts of the remaining entries are updated in place.
        """
        for index, child in enumerate(children):
            self._logger.info("{0} : Child {1} {2}".format(self._name, child, index))
            # Rebuild the list instead of deleting by index while looping,
            # which skipped the entry following each deleted one.
            kept = list()
            for position, entry in enumerate(menu):
                if entry["title"] == child["title"]:
                    self._logger.info("{0} : Parent {1} {2}".format(self._name, entry, position))
                else:
                    kept.append(entry)
            menu = kept
            for entry in menu:
                self._logger.info("{0} : Children for : {1}".format(self._name, entry["title"]))
                if entry["title"] == child["parent"]:
                    entry["children"].append({"id": "", "title": child["title"], "parent": child["parent"], "href": child["href"]})
        return menu

    def _resolveIds(self, menu):
        """Fill in the WordPress id of every entry of ``menu`` and of its children.

        The "home" entry is skipped. The title is searched first; when that
        gives nothing the link is tried through ``_getIdFromReverse``. A
        resolved entry is replaced by a dict holding only "id", "title" and
        "children" (or "parent" for a child); an unresolved one is left as is.
        """
        for i, entry in enumerate(menu):
            self._logger.info("{0} : Menu {1} {2}".format(self._name, entry["title"], len(entry["children"])))
            if entry["title"] == "home":
                continue
            for j, child in enumerate(entry["children"]):
                idMenu = self._getId(child["title"])
                if idMenu == 0:
                    self._logger.debug("{0} : content children {1}".format(self._name, child))
                    idMenu = self._getIdFromReverse(child["title"], child["href"])
                if idMenu != 0:
                    entry["children"][j] = {"id": idMenu, "title": child["title"], "parent": child["parent"]}
            idMenu = self._getId(entry["title"])
            self._logger.debug("{0} : content parent {1}".format(self._name, entry))
            if idMenu == 0:
                idMenu = self._getIdFromReverse(entry["title"], entry["href"])
            if idMenu != 0:
                menu[i] = {"id": idMenu, "title": entry["title"], "children": entry["children"]}
2023-07-11 21:42:50 +02:00