2023-06-30 23:28:54 +02:00
#!/usr/bin/python3
from bs4 import BeautifulSoup
from urllib . parse import urlparse
import requests , os , logging , re , json
from requests . adapters import HTTPAdapter
from requests . packages . urllib3 . util . retry import Retry
class WPMenu:
    """Read a blog navigation menu and match its entries to WordPress terms.

    The menu is taken from the first ``<ul id="listsmooth">`` element of an
    HTML document (loaded from a file or fetched over HTTP). Entries whose
    link is ``#`` are treated as sub-menu headers: the text of each of their
    children is searched in the WordPress REST endpoints
    ``/wp-json/wp/v2/categories`` and ``/wp-json/wp/v2/tags``. Every other
    entry is collected as a plain "parent" title.
    """

    # Constructor
    def __init__(self, name="Thread-0", basic=None, canalblog="", wordpress="", logger=None, parser="html.parser", ssl_canalblog=True, ssl_wordpress=True):
        """Store the configuration and prepare one HTTP session per site.

        Args:
            name: Label prefixed to every log message.
            basic: Passed as ``auth=`` on WordPress REST requests.
            canalblog: Stored as-is; not read by the methods of this class.
            wordpress: Host used to build the WordPress REST URLs.
            logger: Object exposing ``info``/``error``/``debug``. When omitted,
                a module logger is used so that logging calls never fail.
            parser: Parser name handed to BeautifulSoup.
            ssl_canalblog: Only the literal ``False`` switches the source
                site to plain ``http``.
            ssl_wordpress: Only the literal ``False`` switches WordPress to
                plain ``http``.
        """
        self._name = name
        self._basic = basic
        self._canalblog = canalblog
        self._wordpress = wordpress
        # A None logger would make every method raise AttributeError.
        self._logger = logger if logger is not None else logging.getLogger(__name__)
        self._parser = parser
        self._headers_json = {'Content-Type': 'application/json; charset=utf-8', 'Accept': 'application/json'}
        self._protocol_wordpress = "https"
        self._protocol_canalblog = "https"
        self._directory = "backup"

        if ssl_wordpress is False:
            self._protocol_wordpress = "http"
        if ssl_canalblog is False:
            self._protocol_canalblog = "http"

        self._request_canalblog = requests.Session()
        self._request_wordpress = requests.Session()

        # Retry connection/read failures and the listed HTTP statuses with
        # an increasing delay between attempts.
        retries = Retry(connect=10, read=10, redirect=5,
                        status_forcelist=[429, 500, 502, 503, 504], backoff_factor=2)

        self._request_canalblog.mount('{0}://'.format(self._protocol_canalblog), HTTPAdapter(max_retries=retries))
        self._request_wordpress.mount('{0}://'.format(self._protocol_wordpress), HTTPAdapter(max_retries=retries))

    # Destructor
    def __del__(self):
        """Print a completion message when the instance is collected."""
        print("{0} : Import finished for {1}".format(self._name, self._wordpress))

    # Public method
    ## From file
    def fromFile(self, files):
        """Process the menu contained in the HTML file at path ``files``.

        Logs an error and does nothing when the path does not exist.
        """
        if os.path.exists(files):
            with open(files, 'r') as f:
                self._logger.info("{0} : File is being processed : {1}".format(self._name, files))
                content = f.read()
            # The handle is released before the network-bound processing.
            self._menu(content)
        else:
            self._logger.error("{0} : File isn't exist : {1}".format(self._name, files))

    ## Get from URL
    def fromUrl(self, canalblog):
        """Fetch ``canalblog`` over HTTP and process the menu it contains.

        The URL scheme is forced to the configured source protocol. A
        connection failure is logged and terminates the process with exit
        status 1; any other exception is logged and swallowed.
        """
        try:
            o = urlparse(canalblog)
            o = o._replace(scheme=self._protocol_canalblog)
            # A scheme-less input has an empty netloc, which geturl() renders
            # as "scheme:///host/..."; collapse the extra slash.
            i = o.geturl().replace(":///", "://")
            page = self._request_canalblog.get(i)
            if page.status_code == 200:
                self._logger.info("{0} : Page web is being processed : {1}".format(self._name, i))
                self._menu(page.content)
            else:
                self._logger.error("{0} : index didn't get due status code : {1}".format(self._name, page.status_code))
                self._logger.debug("{0} : {1}".format(self._name, page.content))
        # requests raises its own ConnectionError, which is NOT a subclass of
        # the builtin one; both are listed so the fatal path really triggers.
        except (requests.exceptions.ConnectionError, ConnectionError) as err:
            self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, canalblog, err))
            raise SystemExit(1)
        except Exception as err:
            self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, canalblog, err))

    def _menu(self, content):
        """Parse ``content`` and resolve its menu entries.

        Builds two lists: ``menu`` (WordPress terms found for the children of
        ``#`` entries) and ``parents`` (titles of all other entries, minus
        those already present in ``menu``), then logs the remaining parents.
        """
        soup = BeautifulSoup(content, self._parser)
        ul = soup.find_all("ul", id="listsmooth")
        if not ul:
            # Without this guard ul[0] raises IndexError on pages with no menu.
            self._logger.error("{0} : No ul#listsmooth menu found".format(self._name))
            return
        menu = list()
        parents = list()
        # find_all("li") is recursive: nested items are visited here as well,
        # which is why matching titles are pruned from parents afterwards.
        for anchor in ul[0].find_all("li"):
            link = anchor.find("a")
            if link is None:
                continue
            parent = link.get_text()
            href = link.get("href")
            if href == "#":
                for child in anchor.find_all("li"):
                    a = child.find("a")
                    if a is None:
                        continue
                    title = a.get_text()
                    self._logger.info("{0} {1} : {2}".format(self._name, parent, title))
                    menu.extend(self._lookup_terms(title, parent))
            else:
                self._logger.info("{0} : Parent title : {1}".format(self._name, parent))
                parents.append(parent)
        self._prune_parents(menu, parents)
        for i in parents:
            self._logger.info("{0} : Parent : {1}".format(self._name, i))

    def _lookup_terms(self, title, parent):
        """Return menu entries for ``title`` found among categories and tags."""
        found = list()
        params = {"search": title, "per_page": "100"}
        for i in ["categories", "tags"]:
            url = "{2}://{0}/wp-json/wp/v2/{1}".format(self._wordpress, i, self._protocol_wordpress)
            try:
                page = self._request_wordpress.get(url, auth=self._basic, params=params)
                if page.status_code == 200:
                    result = page.json()
                    if len(result) > 0:
                        # Only the first search hit is kept.
                        found.append({"id": result[0]["id"], "type": i, "title": title, "parent": parent})
                else:
                    self._logger.error("{0} : {2} didn't get due status code : {1}".format(self._name, page.status_code, i))
                    self._logger.debug("{0} : {1}".format(self._name, page.content))
            except (requests.exceptions.ConnectionError, ConnectionError) as err:
                self._logger.error("{0} : Connection error for get url {1} : {2}".format(self._name, url, err))
                raise SystemExit(1)
            except Exception as err:
                self._logger.error("{0} : Exception error for get url {1} : {2}".format(self._name, url, err))
        return found

    def _prune_parents(self, menu, parents):
        """Remove from ``parents`` (in place) every title present in ``menu``.

        Returns the same ``parents`` list for convenience.
        """
        for entry in menu:
            self._logger.info("{0} : Menu : {1} {2}".format(self._name, entry, len(parents)))
            # Iterate over a snapshot: parents is mutated inside the loop.
            for title in list(parents):
                self._logger.info("{0} : Comparaison : {1} {2}".format(self._name, entry["title"], title))
                if entry["title"] == title:
                    self._logger.info("{0} del : {1}".format(self._name, title))
                    parents.remove(title)
        return parents