Compare commits

..

No commits in common. "aaac2385a3cdbc2fb5cc275c27020ec8cb522ba9" and "a39e2200bd8948987c40be3b47231d99c913a4ec" have entirely different histories.

3 changed files with 18 additions and 38 deletions

1
.gitignore vendored
View File

@ -1,5 +1,4 @@
backup/ backup/
backup1/ backup1/
backup2/
web_scrap.log web_scrap.log
__pycache__/ __pycache__/

View File

@ -3,14 +3,13 @@ from requests.auth import HTTPBasicAuth
from getpass import getpass from getpass import getpass
from urllib.parse import urlparse from urllib.parse import urlparse
import argparse, logging, threading import argparse, logging, threading
import multiprocessing
from lib.WPImport import WPimport from lib.WPImport import WPimport
from lib.WPExport import WPExport from lib.WPExport import WPExport
def download(name_thread, max_thread, exportWp, html, img): def download(exportWp, html, img):
exportWp.setName(name_thread) webpage = exportWp.getUrlPage()
webpage = exportWp.getUrlPage(name_thread, max_thread)
if html is False: if html is False:
exportWp.downloadHTML(webpage) exportWp.downloadHTML(webpage)
@ -26,7 +25,6 @@ if __name__ == '__main__':
parser.add_argument("--logfile", help="Log file", default="") parser.add_argument("--logfile", help="Log file", default="")
parser.add_argument("--quiet", help="No console output", action="store_true") parser.add_argument("--quiet", help="No console output", action="store_true")
parser.add_argument("--parser", help="Parser content", default="html.parser") parser.add_argument("--parser", help="Parser content", default="html.parser")
parser.add_argument("--parallel", help="Define number thread (default : 1)", default=1)
subparsers = parser.add_subparsers(dest="command") subparsers = parser.add_subparsers(dest="command")
@ -164,12 +162,10 @@ if __name__ == '__main__':
exportWp.downloadCss() exportWp.downloadCss()
if args.html is False or args.img is False: if args.html is False or args.img is False:
threads = [] webpage = exportWp.getUrlPage()
for i in range(0, int(args.parallel)): if args.html is False:
t1 = multiprocessing.Process(name="Process-{0}".format(i + 1), target=download, args=(i, 3, exportWp,args.html, args.img)) exportWp.downloadHTML(webpage)
threads.append(t1)
for thread in threads: if args.img is False:
thread.start() exportWp.downloadImg(webpage)
thread.join()
exit(0) exit(0)

View File

@ -11,7 +11,6 @@ class WPExport:
self._logger = logger self._logger = logger
self._parser = parser self._parser = parser
self._dir = directory self._dir = directory
self._name = "Process-0"
self._request = requests.Session() self._request = requests.Session()
@ -23,11 +22,6 @@ class WPExport:
# Public method # Public method
# Set name
def setName(self, name):
self._name = "Process-{}".format(int(name) + 1)
# Set URL # Set URL
def setUrl(self, url): def setUrl(self, url):
@ -61,11 +55,11 @@ class WPExport:
# Get URL # Get URL
def getUrlPage(self, index_thread, max_thread): def getUrlPage(self):
try: try:
page = self._request.get(self._url) page = self._request.get(self._url)
except Exception as err: except Exception as err:
self._logger.error("{0} : Connection error : {1}".format(self._name, err)) self._logger.error("Connection error : {0}".format(err))
exit(1) exit(1)
page_url = [] page_url = []
if page.status_code == 200: if page.status_code == 200:
@ -76,8 +70,8 @@ class WPExport:
if href != "#": if href != "#":
page_url.append(href) page_url.append(href)
else: else:
self._logger.error("{0} : URL did not get due status code : {1}".format(self._name, page.status_code)) self._logger.error("Url did not get due status code : {0}".format(page.status_code))
self._logger.debug("{0} : {1}".format(self._name, page.content)) self._logger.debug(page.content)
webpage = [] webpage = []
@ -85,10 +79,10 @@ class WPExport:
try: try:
page = self._request.get(i) page = self._request.get(i)
except Exception as err: except Exception as err:
self._logger.error("{0} : Connection error : {1}".format(self._name, err)) self._logger.error("Connection error : {0}".format(err))
exit(1) exit(1)
if page.status_code == 200: if page.status_code == 200:
self._logger.info("{0} : page : {1}".format(self._name, i)) self._logger.info("page : {0}".format(i))
if i not in webpage: if i not in webpage:
webpage.append(i) webpage.append(i)
soup = BeautifulSoup(page.text, self._parser) soup = BeautifulSoup(page.text, self._parser)
@ -100,22 +94,13 @@ class WPExport:
element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1] element_lastpage = lastpage.split("/")[len(lastpage.split("/"))-1]
number_page = element_lastpage.split("-")[0].split("p")[1] number_page = element_lastpage.split("-")[0].split("p")[1]
number_lastpage = int(number_page) / 10 number_lastpage = int(number_page) / 10
for j in range(1,int(number_lastpage)):
setPageDivided = int(number_lastpage) / max_thread
setPagePart = setPageDivided * (index_thread + 1)
firstPagePart = (setPagePart - setPageDivided) + 1
self._logger.debug("{0} : Total page : {1}".format(self._name,int(number_lastpage)))
self._logger.debug("{0} : First range : {1}".format(self._name, int(firstPagePart)))
self._logger.debug("{0} : Last range : {1}".format(self._name, int(setPagePart)))
for j in range(int(firstPagePart),int(setPagePart)):
paging = j * 10 paging = j * 10
categorie = urlparse(i).path.split("/") categorie = urlparse(i).path.split("/")
url_paging = "{0}/archives/p{1}-10.html".format(self._url, paging) url_paging = "{0}/archives/p{1}-10.html".format(self._url, paging)
if len(categorie) > 2: if len(categorie) > 2:
url_paging = "{0}/archives/{1}/p{2}-10.html".format(self._url, categorie[2], paging) url_paging = "{0}/archives/{1}/p{2}-10.html".format(self._url, categorie[2], paging)
self._logger.info("{0} : {1}".format(self._name, url_paging)) self._logger.info(url_paging)
if url_paging not in webpage: if url_paging not in webpage:
webpage.append(url_paging) webpage.append(url_paging)
page = self._request.get(url_paging) page = self._request.get(url_paging)
@ -133,7 +118,7 @@ class WPExport:
exit(1) exit(1)
webpage.append(o) webpage.append(o)
else: else:
self._logger.error("{0} : web didn't get due status code : {1}".format(self._name, page.status_code)) self._logger.error("web didn't get due status code : {0}".format(page.status_code))
self._logger.debug(page.content) self._logger.debug(page.content)
return webpage return webpage