add parameter parser
This commit is contained in:
parent
cd6b03b0ff
commit
9ed08ea964
13
WPExport.py
13
WPExport.py
@ -4,9 +4,10 @@ from urllib.parse import urlparse
|
|||||||
import requests, os, argparse, logging
|
import requests, os, argparse, logging
|
||||||
|
|
||||||
class WPExport:
|
class WPExport:
|
||||||
def __init__(self, url, logger):
|
def __init__(self, url, logger, parser):
|
||||||
self._url = url
|
self._url = url
|
||||||
self._logger = logger
|
self._logger = logger
|
||||||
|
self._parser = parser
|
||||||
|
|
||||||
def _mkdirPath(self, path_dir, logger):
|
def _mkdirPath(self, path_dir, logger):
|
||||||
if not os.path.exists(path_dir):
|
if not os.path.exists(path_dir):
|
||||||
@ -34,7 +35,7 @@ class WPExport:
|
|||||||
exit(1)
|
exit(1)
|
||||||
page_url = []
|
page_url = []
|
||||||
if page.status_code == 200:
|
if page.status_code == 200:
|
||||||
soup = BeautifulSoup(page.text, 'html.parser')
|
soup = BeautifulSoup(page.text, self._parser)
|
||||||
if js is True:
|
if js is True:
|
||||||
script = soup.find_all("script")
|
script = soup.find_all("script")
|
||||||
for anchor in script:
|
for anchor in script:
|
||||||
@ -78,7 +79,7 @@ class WPExport:
|
|||||||
self._logger.error("Connection error : {0}".format(err))
|
self._logger.error("Connection error : {0}".format(err))
|
||||||
exit(1)
|
exit(1)
|
||||||
if page.status_code == 200:
|
if page.status_code == 200:
|
||||||
soup = BeautifulSoup(page.text, 'html.parser')
|
soup = BeautifulSoup(page.text, self._parser)
|
||||||
img = soup.find_all("img")
|
img = soup.find_all("img")
|
||||||
self._logger.info("image from page: {0} : ".format(i))
|
self._logger.info("image from page: {0} : ".format(i))
|
||||||
for anchor in img:
|
for anchor in img:
|
||||||
@ -97,7 +98,7 @@ class WPExport:
|
|||||||
exit(1)
|
exit(1)
|
||||||
page_url = []
|
page_url = []
|
||||||
if page.status_code == 200:
|
if page.status_code == 200:
|
||||||
soup = BeautifulSoup(page.text, 'html.parser')
|
soup = BeautifulSoup(page.text, self._parser)
|
||||||
ul = soup.find_all("ul", id="listsmooth")
|
ul = soup.find_all("ul", id="listsmooth")
|
||||||
for anchor in ul[0].find_all("a"):
|
for anchor in ul[0].find_all("a"):
|
||||||
href = anchor.get('href', '/')
|
href = anchor.get('href', '/')
|
||||||
@ -115,7 +116,7 @@ class WPExport:
|
|||||||
self._logger.info("page : {0}".format(i))
|
self._logger.info("page : {0}".format(i))
|
||||||
if i not in webpage:
|
if i not in webpage:
|
||||||
webpage.append(i)
|
webpage.append(i)
|
||||||
soup = BeautifulSoup(page.text, 'html.parser')
|
soup = BeautifulSoup(page.text, self._parser)
|
||||||
class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline")
|
class_div = pagingfirstline = soup.find_all("div", class_="pagingfirstline")
|
||||||
if len(class_div) > 0:
|
if len(class_div) > 0:
|
||||||
pagingfirstline = class_div[0].find_all("a")
|
pagingfirstline = class_div[0].find_all("a")
|
||||||
@ -135,7 +136,7 @@ class WPExport:
|
|||||||
webpage.append(url_paging)
|
webpage.append(url_paging)
|
||||||
page = requests.get(url_paging)
|
page = requests.get(url_paging)
|
||||||
if page.status_code == 200:
|
if page.status_code == 200:
|
||||||
soup = BeautifulSoup(page.text, 'html.parser')
|
soup = BeautifulSoup(page.text, self._parser)
|
||||||
h2 = soup.find_all("h2")
|
h2 = soup.find_all("h2")
|
||||||
for title in h2:
|
for title in h2:
|
||||||
href = title.find_all("a")[0].get("href", "/")
|
href = title.find_all("a")[0].get("href", "/")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user