diff --git a/map.py b/map.py
new file mode 100644
index 0000000..fc389f9
--- /dev/null
+++ b/map.py
@@ -0,0 +1,172 @@
+import sqlite3
+import asyncio
+import aiohttp
+from bs4 import BeautifulSoup
+from flask import Flask, jsonify, request, abort, render_template, Response
+from flask_cors import CORS
+
+app = Flask(__name__)
+CORS(app)
+
+progress_stream = []
+
+def get_db_connection():
+    conn = sqlite3.connect('web_graph.db')
+    conn.row_factory = sqlite3.Row
+    return conn
+
+def insert_node(url):
+    conn = get_db_connection()
+    c = conn.cursor()
+    c.execute('SELECT id FROM nodes WHERE url = ?', (url,))
+    node = c.fetchone()
+    if node is None:
+        c.execute('INSERT INTO nodes (url) VALUES (?)', (url,))
+        conn.commit()
+        node_id = c.lastrowid
+    else:
+        node_id = node['id']
+    conn.close()
+    return node_id
+
+def insert_edge(source_id, target_id):
+    conn = get_db_connection()
+    c = conn.cursor()
+    # Check if the edge already exists
+    c.execute('SELECT id FROM edges WHERE source = ? AND target = ?', (source_id, target_id))
+    edge = c.fetchone()
+    if edge is None:
+        c.execute('INSERT INTO edges (source, target) VALUES (?, ?)', (source_id, target_id))
+        conn.commit()
+        edge_id = c.lastrowid
+    else:
+        edge_id = edge['id']
+    conn.close()
+    return edge_id
+
+def insert_original_search(url, depth):
+    conn = get_db_connection()
+    c = conn.cursor()
+    c.execute('INSERT INTO original_searches (url, max_depth, is_original) VALUES (?, ?, ?)', (url, depth, True))
+    conn.commit()
+    search_id = c.lastrowid
+    conn.close()
+    return search_id
+
+async def fetch_links_async(url, visited, depth=1, max_depth=4):
+    if depth > max_depth:
+        return
+
+    source_id = insert_node(url)
+
+    # Only proceed with crawling if the URL hasn't been visited yet
+    if url not in visited:
+        visited.add(url)
+        print(f"Currently fetching links from: {url}")
+
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+        }
+
+        async with aiohttp.ClientSession(headers=headers) as session:
+            try:
+                async with session.get(url) as response:
+                    response.raise_for_status()
+                    html_content = await response.text()
+                    soup = BeautifulSoup(html_content, 'html.parser')
+                    tasks = []
+
+                    for link in soup.find_all('a', href=True):
+                        href = link['href']
+                        if href.startswith('http'):
+                            target_id = insert_node(href)
+                            insert_edge(source_id, target_id)
+                            # Avoid re-crawling if the URL is already visited
+                            if href not in visited:
+                                print(f"Planning to visit: {href}")
+                                task = fetch_links_async(href, visited, depth + 1, max_depth)
+                                tasks.append(task)
+
+                    await asyncio.gather(*tasks)
+
+            except (aiohttp.ClientError, aiohttp.ClientResponseError) as e:
+                print(f"Request failed for {url}: {e}")
+            except Exception as e:
+                print(f"Error processing link {url}: {e}")
+
+        if depth == 1:
+            print(f"Finished going over {url}.")
+    else:
+        print(f"Skipping already visited {url}.")
+
+@app.route('/')
+def index():
+    return render_template('index.html')
+
+@app.route('/test')
+def test():
+    return render_template('test.html')
+
+# Async view functions require Flask 2.x installed with the "flask[async]" extra.
+@app.route('/api/search', methods=['GET', 'POST'])
+async def search():
+    if request.method == 'POST':
+        data = request.get_json(silent=True) or {}
+        starting_url = data.get('starting_url')
+        max_depth = data.get('max_depth', 4)
+    else:
+        starting_url = request.args.get('starting_url')
+        max_depth = request.args.get('max_depth', 4)
+
+    if not starting_url or not starting_url.startswith('http'):
+        abort(400, description="Invalid starting URL provided.")
+
+    try:
+        max_depth = int(max_depth)
+    except ValueError:
+        abort(400, description="Invalid max depth provided.")
+
+    original_search_id = insert_original_search(starting_url, max_depth)
+
+    visited_urls = set()
+    await fetch_links_async(starting_url, visited_urls, max_depth=max_depth)
+    return jsonify({"message": "Crawl started for {}".format(starting_url), 'original_search_id': original_search_id})
+
+@app.route('/api/nodes', methods=['GET'])
+def get_nodes():
+    conn = get_db_connection()
+    nodes = conn.execute('SELECT * FROM nodes').fetchall()
+    conn.close()
+    return jsonify([dict(node) for node in nodes])
+
+@app.route('/api/edges', methods=['GET'])
+def get_edges():
+    conn = get_db_connection()
+    edges = conn.execute('SELECT * FROM edges').fetchall()
+    conn.close()
+    return jsonify([dict(edge) for edge in edges])
+
+@app.route('/api/original_search_ids', methods=['GET'])
+def get_original_search_ids():
+    conn = get_db_connection()
+    search_ids = conn.execute('SELECT * FROM original_searches').fetchall()
+    conn.close()
+    return jsonify([dict(search_id) for search_id in search_ids])
+
+@app.route('/api/cleardb', methods=['POST'])
+def clear_db():
+    conn = get_db_connection()
+    try:
+        conn.execute('DELETE FROM nodes')
+        conn.execute('DELETE FROM edges')
+        conn.execute('DELETE FROM original_searches')
+        conn.commit()
+        return jsonify({'message': 'Database cleared successfully'}), 200
+    except Exception as e:
+        conn.rollback()
+        return jsonify({'error': str(e)}), 500
+    finally:
+        conn.close()
+
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=5000, debug=True)
diff --git a/web_graph.db b/web_graph.db
new file mode 100644
index 0000000..5db7901
Binary files /dev/null and b/web_graph.db differ
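
Note: map.py queries three tables (nodes, edges, original_searches) but never creates them; the schema exists only inside the committed web_graph.db binary, which cannot be inspected from this diff. The sketch below is one bootstrap script consistent with the queries in map.py. The init_db helper name and the exact column types and constraints are assumptions inferred from those queries, not part of this change.

    # init_db.py (illustrative sketch): recreate the tables map.py expects.
    # Column names come from the SQL in map.py; types/constraints are assumed.
    import sqlite3

    def init_db(path='web_graph.db'):
        conn = sqlite3.connect(path)
        conn.executescript('''
            CREATE TABLE IF NOT EXISTS nodes (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT NOT NULL
            );
            CREATE TABLE IF NOT EXISTS edges (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                source INTEGER REFERENCES nodes(id),
                target INTEGER REFERENCES nodes(id)
            );
            CREATE TABLE IF NOT EXISTS original_searches (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT NOT NULL,
                max_depth INTEGER,
                is_original INTEGER
            );
        ''')
        conn.commit()
        conn.close()

    if __name__ == '__main__':
        init_db()

Running a script like this against a fresh path would reproduce the tables the /api/search, /api/nodes, /api/edges, and /api/original_search_ids endpoints read and write.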