import requests
from bs4 import BeautifulSoup
import os
import argparse
from urllib.parse import urljoin
from threading import Thread, Lock
import queue

# Global queue of (url, depth) pairs to explore and a lock for thread-safe access to shared state
url_queue = queue.Queue()
url_lock = Lock()

# Function to download a video file
def download_video(url, folder_path, downloaded_urls):
    # Reserve the URL under the lock so two workers never fetch the same file twice
    with url_lock:
        if url in downloaded_urls:
            print(f"Already downloaded: {url}")
            return
        downloaded_urls.add(url)
    try:
        print(f"Attempting to download: {url}")
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Check for HTTP errors
        file_name = url.split('/')[-1]
        with open(os.path.join(folder_path, file_name), 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    file.write(chunk)
        print(f"Downloaded {file_name}")
    except requests.RequestException as e:
        print(f"Error downloading {url}: {e}")

# Function to check if any keyword is in the URL (case-insensitive)
def contains_keyword(url, keywords):
    url_lower = url.lower()
    return any(keyword.lower() in url_lower for keyword in keywords)

# Function to find video files in specified tags and attributes
def find_videos(page_url, folder_path, visited_urls, keywords, downloaded_urls, depth, max_depth, download_tag, explore_tag):
    if depth > max_depth:
        return
    # visited_urls is shared between workers, so check and mark it under the lock
    with url_lock:
        if page_url in visited_urls:
            return
        visited_urls.add(page_url)

    try:
        print(f"Fetching page: {page_url}")
        response = requests.get(page_url)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.content, 'html.parser')

        # Print a snippet of HTML content for debugging
        print("HTML Snippet:")
        print(soup.prettify()[:1000])  # Print the first 1000 characters for a quick overview

        # Find video files in specified tags and attributes
        video_found = False
        for tag, attr in download_tag:
            for element in soup.find_all(tag, {attr: True}):
                video_url = element[attr]
                if contains_keyword(video_url, keywords):
                    full_url = urljoin(page_url, video_url)
                    print(f"Found video URL: {full_url}")
                    download_video(full_url, folder_path, downloaded_urls)
                    video_found = True

        # Find video files in iframes
        for iframe in soup.find_all('iframe', src=True):
            iframe_url = urljoin(page_url, iframe['src'])
            print(f"Found iframe URL: {iframe_url}")
            try:
                iframe_response = requests.get(iframe_url)
                iframe_response.raise_for_status()
                iframe_soup = BeautifulSoup(iframe_response.content, 'html.parser')
                video_found = find_videos_in_iframe(iframe_soup, folder_path, keywords, iframe_url, downloaded_urls, download_tag) or video_found
            except requests.RequestException as e:
                print(f"Error fetching iframe content {iframe_url}: {e}")

        if not video_found:
            print("No videos found on this page.")

        # Recursively follow links on the page with specific keywords
        for tag, attr in explore_tag:
            for link in soup.find_all(tag, {attr: True}):
                link_url = urljoin(page_url, link[attr])
                if contains_keyword(link_url, keywords):
                    with url_lock:
                        if link_url not in visited_urls:
                            print(f"Adding link to queue: {link_url}")
                            url_queue.put((link_url, depth + 1))

    except requests.RequestException as e:
        print(f"Error fetching {page_url}: {e}")

def find_videos_in_iframe(soup, folder_path, keywords, page_url, downloaded_urls, download_tag):
    video_found = False
    for tag, attr in download_tag:
        for element in soup.find_all(tag, {attr: True}):
            video_url = element[attr]
            if contains_keyword(video_url, keywords):
                full_url = urljoin(page_url, video_url)
                print(f"Found video URL in iframe: {full_url}")
                download_video(full_url, folder_path, downloaded_urls)
                video_found = True
    return video_found

# Worker function: pull URLs off the shared queue until it stays empty
def explore_urls(folder_path, keywords, max_depth, download_tag, explore_tag, visited_urls, downloaded_urls):
    while True:
        try:
            # Wait briefly so pages still being processed can enqueue new links before giving up
            current_url, current_depth = url_queue.get(timeout=5)
        except queue.Empty:
            break
        try:
            find_videos(current_url, folder_path, visited_urls, keywords, downloaded_urls, current_depth, max_depth, download_tag, explore_tag)
        finally:
            url_queue.task_done()

def crawl(start_url, folder_path, keywords, max_depth, download_tag, explore_tag):
    # Create folder path if it does not exist
    os.makedirs(folder_path, exist_ok=True)

    # Shared state: pages visited and files downloaded are tracked once, across all workers
    visited_urls = set()
    downloaded_urls = set()

    # Seed the queue with the starting URL at depth 0
    url_queue.put((start_url, 0))

    # Create and start worker threads for parallel exploration
    num_threads = 4  # Number of threads for parallel exploration
    threads = []
    for _ in range(num_threads):
        thread = Thread(target=explore_urls, args=(folder_path, keywords, max_depth, download_tag, explore_tag, visited_urls, downloaded_urls))
        thread.start()
        threads.append(thread)

    # Wait for all threads to complete
    for thread in threads:
        thread.join()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Web crawler to download video files from web pages.')
    parser.add_argument('start_url', type=str, help='The starting URL for the web crawler.')
    parser.add_argument('folder_path', type=str, help='The folder path where videos will be downloaded.')
    parser.add_argument('keywords', type=str, nargs='+', help='Keywords to filter links.')
    parser.add_argument('--max_depth', type=int, default=2, help='Maximum depth to crawl.')
    parser.add_argument('--download_tag', type=str, default='source:src', help='Tag and attribute used to find video sources (format: tag:attribute, comma-separated for multiple pairs).')
    parser.add_argument('--explore_tag', type=str, default='a:href', help='Tag and attribute used to find links to explore (format: tag:attribute, comma-separated for multiple pairs).')

    args = parser.parse_args()

    # Parse the tag:attribute arguments into lists of (tag, attribute) tuples
    download_tag = [tuple(tag_attr.split(':')) for tag_attr in args.download_tag.split(',')]
    explore_tag = [tuple(tag_attr.split(':')) for tag_attr in args.explore_tag.split(',')]

    crawl(args.start_url, args.folder_path, args.keywords, args.max_depth, download_tag, explore_tag)
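A minimal usage sketch, driving the crawler from Python rather than the command line. The module name, site URL, output folder, and keywords below are placeholders assumed for illustration; they are not part of the commit:

# Assumes the script above is saved as video_crawler.py (the commit does not show the file name).
from video_crawler import crawl

download_tag = [("source", "src"), ("video", "src")]  # same shape argparse builds from 'source:src,video:src'
explore_tag = [("a", "href")]                         # same shape as the default '--explore_tag a:href'

crawl(
    "https://example.com/media",  # placeholder start_url
    "downloads",                  # placeholder folder_path
    ["mp4", "episode"],           # placeholder keywords
    2,                            # max_depth
    download_tag,
    explore_tag,
)

The equivalent command-line run, assuming the same placeholder file name, would be `python video_crawler.py https://example.com/media downloads mp4 episode --max_depth 2 --download_tag source:src,video:src`.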