# WebPath-Explorer/map.py

import sqlite3
import asyncio

import aiohttp
from bs4 import BeautifulSoup
from flask import Flask, jsonify, request, abort, render_template, Response
from flask_cors import CORS

app = Flask(__name__)
CORS(app)

progress_stream = []


def get_db_connection():
    conn = sqlite3.connect('web_graph.db')
    conn.row_factory = sqlite3.Row
    return conn
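

# The tables used below (nodes, edges, original_searches) are assumed to exist
# already. The function below is only a minimal bootstrap sketch inferred from
# the columns this module touches -- the real schema may differ. It is never
# called here; run it once manually if the database file is empty.
def init_db():
    conn = sqlite3.connect('web_graph.db')
    conn.executescript('''
        CREATE TABLE IF NOT EXISTS nodes (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT
        );
        CREATE TABLE IF NOT EXISTS edges (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source INTEGER,
            target INTEGER
        );
        CREATE TABLE IF NOT EXISTS original_searches (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT,
            max_depth INTEGER,
            is_original BOOLEAN
        );
    ''')
    conn.commit()
    conn.close()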


def insert_node(url):
    # Return the id for url, inserting a new row if it is not present yet.
    conn = get_db_connection()
    c = conn.cursor()
    c.execute('SELECT id FROM nodes WHERE url = ?', (url,))
    node = c.fetchone()
    if node is None:
        c.execute('INSERT INTO nodes (url) VALUES (?)', (url,))
        conn.commit()
        node_id = c.lastrowid
    else:
        node_id = node['id']
    conn.close()
    return node_id


def insert_edge(source_id, target_id):
    conn = get_db_connection()
    c = conn.cursor()
    # Check if the edge already exists
    c.execute('SELECT id FROM edges WHERE source = ? AND target = ?', (source_id, target_id))
    edge = c.fetchone()
    if edge is None:
        c.execute('INSERT INTO edges (source, target) VALUES (?, ?)', (source_id, target_id))
        conn.commit()
        edge_id = c.lastrowid
    else:
        edge_id = edge['id']
    conn.close()
    return edge_id


def insert_original_search(url, depth):
    conn = get_db_connection()
    c = conn.cursor()
    c.execute('INSERT INTO original_searches (url, max_depth, is_original) VALUES (?, ?, ?)', (url, depth, True))
    conn.commit()
    search_id = c.lastrowid
    conn.close()
    return search_id


async def fetch_links_async(url, visited, depth=1, max_depth=4):
    if depth > max_depth:
        return
    source_id = insert_node(url)
    # Only proceed with crawling if the URL hasn't been visited yet
    if url not in visited:
        visited.add(url)
        print(f"Currently fetching links from: {url}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        async with aiohttp.ClientSession(headers=headers) as session:
            try:
                async with session.get(url) as response:
                    response.raise_for_status()
                    html_content = await response.text()
                soup = BeautifulSoup(html_content, 'html.parser')
                tasks = []
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if href.startswith('http'):
                        target_id = insert_node(href)
                        insert_edge(source_id, target_id)
                        # Avoid re-crawling if the URL is already visited
                        if href not in visited:
                            print(f"Planning to visit: {href}")
                            task = fetch_links_async(href, visited, depth + 1, max_depth)
                            tasks.append(task)
                # Crawl all newly discovered links concurrently
                await asyncio.gather(*tasks)
            except (aiohttp.ClientError, aiohttp.ClientResponseError) as e:
                print(f"Request failed for {url}: {e}")
            except Exception as e:
                print(f"Error processing link {url}: {e}")
        if depth == 1:
            print(f"Finished going over {url}.")
    else:
        print(f"Skipping already visited {url}.")


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/test')
def test():
    return render_template('test.html')


# Async view: requires Flask 2.0+ installed with the async extra (flask[async]).
@app.route('/api/search', methods=['GET', 'POST'])
async def search():
    if request.method == 'POST':
        data = request.json
        starting_url = data.get('starting_url')
        max_depth = data.get('max_depth', 4)
    else:
        starting_url = request.args.get('starting_url')
        max_depth = request.args.get('max_depth', 4)
    if not starting_url or not starting_url.startswith('http'):
        abort(400, description="Invalid starting URL provided.")
    try:
        max_depth = int(max_depth)
    except ValueError:
        abort(400, description="Invalid max depth provided.")
    original_search_id = insert_original_search(starting_url, max_depth)
    visited_urls = set()
    # The crawl is awaited here, so the response is only sent once it finishes.
    await fetch_links_async(starting_url, visited_urls, max_depth=max_depth)
    return jsonify({'message': f"Crawl completed for {starting_url}", 'original_search_id': original_search_id})


@app.route('/api/nodes', methods=['GET'])
def get_nodes():
    conn = get_db_connection()
    nodes = conn.execute('SELECT * FROM nodes').fetchall()
    conn.close()
    return jsonify([dict(node) for node in nodes])


@app.route('/api/edges', methods=['GET'])
def get_edges():
    conn = get_db_connection()
    edges = conn.execute('SELECT * FROM edges').fetchall()
    conn.close()
    return jsonify([dict(edge) for edge in edges])


@app.route('/api/original_search_ids', methods=['GET'])
def get_original_search_ids():
    conn = get_db_connection()
    search_ids = conn.execute('SELECT * FROM original_searches').fetchall()
    conn.close()
    return jsonify([dict(search_id) for search_id in search_ids])


@app.route('/api/cleardb', methods=['POST'])
def clear_db():
    conn = get_db_connection()
    try:
        conn.execute('DELETE FROM nodes')
        conn.execute('DELETE FROM edges')
        conn.execute('DELETE FROM original_searches')
        conn.commit()
        return jsonify({'message': 'Database cleared successfully'}), 200
    except Exception as e:
        conn.rollback()
        return jsonify({'error': str(e)}), 500
    finally:
        conn.close()


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=True)
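
# Example requests against a locally running instance (host/port assumed from
# app.run above; the crawl target is a placeholder):
#
#   curl -X POST http://localhost:5000/api/search \
#        -H 'Content-Type: application/json' \
#        -d '{"starting_url": "https://example.com", "max_depth": 2}'
#   curl http://localhost:5000/api/nodes
#   curl http://localhost:5000/api/edges
#   curl -X POST http://localhost:5000/api/cleardb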