add parameter parser
parent cd6b03b0ff
commit 9ed08ea964
Changed files: WPExport.py (13 lines changed)
@@ -4,9 +4,10 @@ from urllib.parse import urlparse
 import requests, os, argparse, logging
 
 class WPExport:
-    def __init__(self, url, logger):
+    def __init__(self, url, logger, parser):
         self._url = url
         self._logger = logger
+        self._parser = parser
 
     def _mkdirPath(self, path_dir, logger):
         if not os.path.exists(path_dir):
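The constructor change above threads the parser name through as instance state instead of hard-coding it at each call site. A minimal usage sketch, assuming a logger is already configured (the logger name and target URL here are invented for illustration):

    import logging

    logger = logging.getLogger("wp_export")            # hypothetical logger name
    export = WPExport("https://blog.example.org",      # hypothetical target URL
                      logger,
                      "html.parser")                   # any parser name bs4 accepts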
@@ -34,7 +35,7 @@ class WPExport:
             exit(1)
         page_url = []
         if page.status_code == 200:
-            soup = BeautifulSoup(page.text, 'html.parser')
+            soup = BeautifulSoup(page.text, self._parser)
             if js is True:
                 script = soup.find_all("script")
                 for anchor in script:
@@ -78,7 +79,7 @@ class WPExport:
             self._logger.error("Connection error : {0}".format(err))
             exit(1)
         if page.status_code == 200:
-            soup = BeautifulSoup(page.text, 'html.parser')
+            soup = BeautifulSoup(page.text, self._parser)
             img = soup.find_all("img")
             self._logger.info("image from page: {0} : ".format(i))
             for anchor in img:
@@ -97,7 +98,7 @@ class WPExport:
             exit(1)
         page_url = []
         if page.status_code == 200:
-            soup = BeautifulSoup(page.text, 'html.parser')
+            soup = BeautifulSoup(page.text, self._parser)
             ul = soup.find_all("ul", id="listsmooth")
             for anchor in ul[0].find_all("a"):
                 href = anchor.get('href', '/')
@@ -115,7 +116,7 @@ class WPExport:
             self._logger.info("page : {0}".format(i))
             if i not in webpage:
                 webpage.append(i)
-            soup = BeautifulSoup(page.text, 'html.parser')
+            soup = BeautifulSoup(page.text, self._parser)
             class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline")
             if len(class_div) > 0:
                 pagingfirstline = class_div[0].find_all("a")
@@ -135,7 +136,7 @@ class WPExport:
                     webpage.append(url_paging)
                     page = requests.get(url_paging)
                     if page.status_code == 200:
-                        soup = BeautifulSoup(page.text, 'html.parser')
+                        soup = BeautifulSoup(page.text, self._parser)
                         h2 = soup.find_all("h2")
                         for title in h2:
                             href = title.find_all("a")[0].get("href", "/")
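Every BeautifulSoup call now reads the parser name from self._parser instead of hard-coding 'html.parser'. The string is handed straight to bs4, so it must be a parser BeautifulSoup recognizes: the stdlib "html.parser", or "lxml" / "html5lib" when those third-party packages are installed. A standalone sketch of what the parser argument controls (the sample HTML is invented for illustration):

    from bs4 import BeautifulSoup

    html = '<ul id="listsmooth"><li><a href="/about">About</a></li></ul>'
    soup = BeautifulSoup(html, "html.parser")        # swap in "lxml" if installed
    print(soup.find_all("a")[0].get("href", "/"))    # -> /about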