separate article and page

Valentin CZERYBA 2023-05-07 09:26:48 +02:00
parent 21d2f35e6e
commit f9be6770e3
3 changed files with 26 additions and 18 deletions
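The change replaces the single flat URL list built by WPExport.getUrlPage with a dict that keeps listing pages and article links in separate buckets, and updates the download path to consume both. The shape change, in short:

# Before: every URL, page or article, in one list
webpage = []
# After: listing pages and article links kept apart
webpage = {"page": [], "article": []}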

.gitignore

@@ -1,4 +1,4 @@
 backup*/
 wp-navigation
-web_scrap.log
+*.log
 __pycache__/
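The ignore rule is broadened from the single web_scrap.log file to any *.log file, so additional log files (for example one per thread) no longer need their own entries.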


@@ -3,23 +3,27 @@ from requests.auth import HTTPBasicAuth
 from getpass import getpass
 from urllib.parse import urlparse
 from concurrent import futures
+from concurrent.futures import as_completed, wait
 import argparse, logging, threading
 from lib.WPImport import WPimport
 from lib.WPExport import WPExport

 def download(name_thread, max_thread, url, logger, parser, directory, html, img):
     exportWp = WPExport(name="Thread-{0}".format(int(name_thread) + 1), url=url, logger=logger, parser=parser, directory=directory)
     webpage = exportWp.getUrlPage(name_thread, max_thread)
     if html is False:
-        exportWp.downloadHTML(webpage)
+        exportWp.downloadHTML(webpage["article"])
+        exportWp.downloadHTML(webpage["page"])
-    if args.img is False:
-        exportWp.downloadImg(webpage)
-    del exportWp
+    if img is False:
+        exportWp.downloadImg(webpage["article"])
+        exportWp.downloadImg(webpage["page"])

 def importUrl(name_thread, max_thread, canalblog, logger, parser, wordpress, basic, serial):
     canalblog = canalblog.split(",")
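The new as_completed/wait import suggests the main block dispatches download() and importUrl() through a thread pool. A minimal sketch of that pattern, assuming a pool size and pre-bound arguments that are not visible in this hunk:

from concurrent.futures import ThreadPoolExecutor, as_completed

max_thread = 4  # assumption: the real value comes from the script's CLI arguments

with ThreadPoolExecutor(max_workers=max_thread) as pool:
    # one worker per thread index, mirroring download(name_thread, max_thread, ...)
    jobs = [pool.submit(download, i, max_thread, url, logger, parser, directory, html, img)
            for i in range(max_thread)]
    for job in as_completed(jobs):
        job.result()  # re-raises any exception from the worker

url, logger, parser, directory, html and img are assumed to be bound earlier, as in the signature of download() above.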
@@ -198,6 +202,8 @@ if __name__ == '__main__':
     if args.css is False:
         exportWp.downloadCss()
     del exportWp
     if args.html is False or args.img is False:
         try:


@@ -13,6 +13,7 @@ class WPExport:
         self._dir = directory
         self._name = name
         self._request = requests.Session()
         retries = Retry(total=10,
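The Retry(total=10, …) call is cut off in this hunk; a retry policy only takes effect once it is mounted on the session through an HTTPAdapter. A typical wiring, with every parameter beyond total=10 assumed:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(total=10,  # only this argument is visible in the diff
                backoff_factor=0.5,  # assumed: exponential back-off between attempts
                status_forcelist=[429, 500, 502, 503, 504])  # assumed status codes
session.mount("http://", HTTPAdapter(max_retries=retries))
session.mount("https://", HTTPAdapter(max_retries=retries))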
@@ -27,6 +28,7 @@ class WPExport:
     # Public method
     # Set name
     def setName(self, name):
@@ -83,8 +85,8 @@ class WPExport:
             self._logger.error("{0} : URL did not get due status code : {1}".format(self._name, page.status_code))
             self._logger.debug("{0} : {1}".format(self._name, page.content))

-        webpage = []
+        webpage = {"page":[], "article":[]}
         for i in page_url:
             try:
                 page = self._request.get(i)
@@ -93,8 +95,8 @@ class WPExport:
                 exit(1)
             if page.status_code == 200:
                 self._logger.info("{0} : page : {1}".format(self._name, i))
-                if i not in webpage:
-                    webpage.append(i)
+                if i not in webpage["page"]:
+                    webpage["page"].append(i)
                 soup = BeautifulSoup(page.text, self._parser)
                 class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline")
                 if len(class_div) > 0:
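One design note on the pattern kept here: the check if i not in webpage["page"] scans a Python list, so deduplication cost grows with the number of collected URLs. An alternative (not in this commit) is to pair the list with a set for constant-time membership checks:

seen = set()
pages = []
for url in candidate_urls:  # candidate_urls is a placeholder name
    if url not in seen:  # hash lookup instead of a list scan
        seen.add(url)
        pages.append(url)  # the list still preserves discovery order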
@@ -120,27 +122,27 @@ class WPExport:
                     if len(categorie) > 2:
                         url_paging = "{0}/archives/{1}/p{2}-10.html".format(self._url, categorie[2], paging)
                         self._logger.info("{0} : {1}".format(self._name, url_paging))
-                        if url_paging not in webpage:
-                            webpage.append(url_paging)
+                        if url_paging not in webpage["page"]:
+                            webpage["page"].append(url_paging)
                         page = self._request.get(url_paging)
                         if page.status_code == 200:
                             soup = BeautifulSoup(page.text, self._parser)
                             h2 = soup.find_all("h2")
                             for title in h2:
                                 href = title.find_all("a")[0].get("href", "/")
-                                if href not in webpage:
+                                if href not in webpage["article"]:
                                     try:
                                         o = urlparse(href)
                                         o = o._replace(scheme="https").geturl()
                                     except Exception as err:
                                         self._logger.error("parsing error : {0}".format(err))
                                         exit(1)
-                                    webpage.append(o)
+                                    webpage["article"].append(o)
             else:
                 self._logger.error("{0} : web didn't get due status code : {1}".format(self._name, page.status_code))
-                self._logger.debug(page.content)
+                self._logger.debug("{0} : {1}".format(self._name, page.content))
         return webpage

     # Private method
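After this change a caller gets both buckets from a single call; a minimal usage sketch, reusing the constructor arguments shown in the first file:

exportWp = WPExport(name="Thread-1", url=url, logger=logger, parser=parser, directory=directory)
webpage = exportWp.getUrlPage(0, 1)  # thread index 0 of 1 thread
print("{0} listing pages, {1} articles".format(len(webpage["page"]), len(webpage["article"])))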