# WebPath-Explorer/map.py

import sqlite3
import asyncio

import aiohttp
from bs4 import BeautifulSoup
from flask import Flask, jsonify, request, abort, render_template, Response
from flask_cors import CORS

app = Flask(__name__)
CORS(app)

progress_stream = []


def get_db_connection():
    conn = sqlite3.connect('web_graph.db')
    conn.row_factory = sqlite3.Row
    return conn
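

# The tables used below (nodes, edges, original_searches) are assumed to exist
# already. The function below is only a minimal bootstrap sketch inferred from
# the columns this module touches -- the real schema may differ. It is never
# called here; run it once manually if the database file is empty.
def init_db():
    conn = sqlite3.connect('web_graph.db')
    conn.executescript('''
        CREATE TABLE IF NOT EXISTS nodes (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT
        );
        CREATE TABLE IF NOT EXISTS edges (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source INTEGER,
            target INTEGER
        );
        CREATE TABLE IF NOT EXISTS original_searches (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT,
            max_depth INTEGER,
            is_original BOOLEAN
        );
    ''')
    conn.commit()
    conn.close()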


def insert_node(url):
    # Return the id for url, inserting a new row if it is not present yet.
    conn = get_db_connection()
    c = conn.cursor()
    c.execute('SELECT id FROM nodes WHERE url = ?', (url,))
    node = c.fetchone()
    if node is None:
        c.execute('INSERT INTO nodes (url) VALUES (?)', (url,))
        conn.commit()
        node_id = c.lastrowid
    else:
        node_id = node['id']
    conn.close()
    return node_id


def insert_edge(source_id, target_id):
    conn = get_db_connection()
    c = conn.cursor()
    # Check if the edge already exists
    c.execute('SELECT id FROM edges WHERE source = ? AND target = ?', (source_id, target_id))
    edge = c.fetchone()
    if edge is None:
        c.execute('INSERT INTO edges (source, target) VALUES (?, ?)', (source_id, target_id))
        conn.commit()
        edge_id = c.lastrowid
    else:
        edge_id = edge['id']
    conn.close()
    return edge_id


def insert_original_search(url, depth):
    conn = get_db_connection()
    c = conn.cursor()
    c.execute('INSERT INTO original_searches (url, max_depth, is_original) VALUES (?, ?, ?)', (url, depth, True))
    conn.commit()
    search_id = c.lastrowid
    conn.close()
    return search_id


async def fetch_links_async(url, visited, depth=1, max_depth=4):
    if depth > max_depth:
        return
    source_id = insert_node(url)
    # Only proceed with crawling if the URL hasn't been visited yet
    if url not in visited:
        visited.add(url)
        print(f"Currently fetching links from: {url}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        async with aiohttp.ClientSession(headers=headers) as session:
            try:
                async with session.get(url) as response:
                    response.raise_for_status()
                    html_content = await response.text()
                soup = BeautifulSoup(html_content, 'html.parser')
                tasks = []
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if href.startswith('http'):
                        target_id = insert_node(href)
                        insert_edge(source_id, target_id)
                        # Avoid re-crawling if the URL is already visited
                        if href not in visited:
                            print(f"Planning to visit: {href}")
                            task = fetch_links_async(href, visited, depth + 1, max_depth)
                            tasks.append(task)
                # Crawl all newly discovered links concurrently
                await asyncio.gather(*tasks)
            except (aiohttp.ClientError, aiohttp.ClientResponseError) as e:
                print(f"Request failed for {url}: {e}")
            except Exception as e:
                print(f"Error processing link {url}: {e}")
        if depth == 1:
            print(f"Finished going over {url}.")
    else:
        print(f"Skipping already visited {url}.")


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/test')
def test():
    return render_template('test.html')


# Async view: requires Flask 2.0+ installed with the async extra (flask[async]).
@app.route('/api/search', methods=['GET', 'POST'])
async def search():
    if request.method == 'POST':
        data = request.json
        starting_url = data.get('starting_url')
        max_depth = data.get('max_depth', 4)
    else:
        starting_url = request.args.get('starting_url')
        max_depth = request.args.get('max_depth', 4)
    if not starting_url or not starting_url.startswith('http'):
        abort(400, description="Invalid starting URL provided.")
    try:
        max_depth = int(max_depth)
    except ValueError:
        abort(400, description="Invalid max depth provided.")
    original_search_id = insert_original_search(starting_url, max_depth)
    visited_urls = set()
    # The crawl is awaited here, so the response is only sent once it finishes.
    await fetch_links_async(starting_url, visited_urls, max_depth=max_depth)
    return jsonify({'message': f"Crawl completed for {starting_url}", 'original_search_id': original_search_id})


@app.route('/api/nodes', methods=['GET'])
def get_nodes():
    conn = get_db_connection()
    nodes = conn.execute('SELECT * FROM nodes').fetchall()
    conn.close()
    return jsonify([dict(node) for node in nodes])


@app.route('/api/edges', methods=['GET'])
def get_edges():
    conn = get_db_connection()
    edges = conn.execute('SELECT * FROM edges').fetchall()
    conn.close()
    return jsonify([dict(edge) for edge in edges])


@app.route('/api/original_search_ids', methods=['GET'])
def get_original_search_ids():
    conn = get_db_connection()
    search_ids = conn.execute('SELECT * FROM original_searches').fetchall()
    conn.close()
    return jsonify([dict(search_id) for search_id in search_ids])


@app.route('/api/cleardb', methods=['POST'])
def clear_db():
    conn = get_db_connection()
    try:
        conn.execute('DELETE FROM nodes')
        conn.execute('DELETE FROM edges')
        conn.execute('DELETE FROM original_searches')
        conn.commit()
        return jsonify({'message': 'Database cleared successfully'}), 200
    except Exception as e:
        conn.rollback()
        return jsonify({'error': str(e)}), 500
    finally:
        conn.close()


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=True)
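
# Example requests against a locally running instance (host/port assumed from
# app.run above; the crawl target is a placeholder):
#
#   curl -X POST http://localhost:5000/api/search \
#        -H 'Content-Type: application/json' \
#        -d '{"starting_url": "https://example.com", "max_depth": 2}'
#   curl http://localhost:5000/api/nodes
#   curl http://localhost:5000/api/edges
#   curl -X POST http://localhost:5000/api/cleardb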