Upload files to "/"
This commit is contained in:
parent
cecd31f124
commit
520814d985
169 map.py Normal file
@ -0,0 +1,169 @@
import sqlite3
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from flask import Flask, jsonify, request, abort, render_template, Response
from flask_cors import CORS

app = Flask(__name__)
CORS(app)

progress_stream = []

def get_db_connection():
    conn = sqlite3.connect('web_graph.db')
    conn.row_factory = sqlite3.Row
    return conn

def insert_node(url):
    conn = get_db_connection()
    c = conn.cursor()
    c.execute('SELECT id FROM nodes WHERE url = ?', (url,))
    node = c.fetchone()
    if node is None:
        c.execute('INSERT INTO nodes (url) VALUES (?)', (url,))
        conn.commit()
        node_id = c.lastrowid
    else:
        node_id = node['id']
    conn.close()
    return node_id

def insert_edge(source_id, target_id):
    conn = get_db_connection()
    c = conn.cursor()
    # Check if the edge already exists
    c.execute('SELECT id FROM edges WHERE source = ? AND target = ?', (source_id, target_id))
    edge = c.fetchone()
    if edge is None:
        c.execute('INSERT INTO edges (source, target) VALUES (?, ?)', (source_id, target_id))
        conn.commit()
        edge_id = c.lastrowid
    else:
        edge_id = edge['id']
    conn.close()
    return edge_id

def insert_original_search(url, depth):
    conn = get_db_connection()
    c = conn.cursor()
    c.execute('INSERT INTO original_searches (url, max_depth, is_original) VALUES (?, ?, ?)', (url, depth, True))
    conn.commit()
    search_id = c.lastrowid
    conn.close()
    return search_id

async def fetch_links_async(url, visited, depth=1, max_depth=4):
    if depth > max_depth:
        return

    source_id = insert_node(url)

    # Only proceed with crawling if the URL hasn't been visited yet
    if url not in visited:
        visited.add(url)
        print(f"Currently fetching links from: {url}")

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }

        async with aiohttp.ClientSession(headers=headers) as session:
            try:
                async with session.get(url) as response:
                    response.raise_for_status()
                    html_content = await response.text()
                    soup = BeautifulSoup(html_content, 'html.parser')
                    tasks = []

                    for link in soup.find_all('a', href=True):
                        href = link['href']
                        if href.startswith('http'):
                            target_id = insert_node(href)
                            insert_edge(source_id, target_id)
                            # Avoid re-crawling if the URL is already visited
                            if href not in visited:
                                print(f"Planning to visit: {href}")
                                task = fetch_links_async(href, visited, depth + 1, max_depth)
                                tasks.append(task)

                    await asyncio.gather(*tasks)

            except (aiohttp.ClientError, aiohttp.ClientResponseError) as e:
                print(f"Request failed for {url}: {e}")
            except Exception as e:
                print(f"Error processing link {url}: {e}")

        if depth == 1:
            print(f"Finished going over {url}.")
    else:
        print(f"Skipping already visited {url}.")

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/test')
def test():
    return render_template('test.html')

@app.route('/api/search', methods=['GET', 'POST'])
async def search():
    if request.method == 'POST':
        data = request.json
        starting_url = data.get('starting_url')
        max_depth = data.get('max_depth', 4)
    else:
        starting_url = request.args.get('starting_url')
        max_depth = request.args.get('max_depth', 4)

    if not starting_url or not starting_url.startswith('http'):
        abort(400, description="Invalid starting URL provided.")

    try:
        max_depth = int(max_depth)
    except ValueError:
        abort(400, description="Invalid max depth provided.")

    original_search_id = insert_original_search(starting_url, max_depth)

    visited_urls = set()
    await fetch_links_async(starting_url, visited_urls, max_depth=max_depth)
    return jsonify({"message": "Crawl started for {}".format(starting_url), 'original_search_id': original_search_id})

@app.route('/api/nodes', methods=['GET'])
def get_nodes():
    conn = get_db_connection()
    nodes = conn.execute('SELECT * FROM nodes').fetchall()
    conn.close()
    return jsonify([dict(node) for node in nodes])

@app.route('/api/edges', methods=['GET'])
def get_edges():
    conn = get_db_connection()
    edges = conn.execute('SELECT * FROM edges').fetchall()
    conn.close()
    return jsonify([dict(edge) for edge in edges])

@app.route('/api/original_search_ids', methods=['GET'])
def get_original_search_ids():
    conn = get_db_connection()
    search_ids = conn.execute('SELECT * FROM original_searches').fetchall()
    conn.close()
    return jsonify([dict(search_id) for search_id in search_ids])

@app.route('/api/cleardb', methods=['POST'])
def clear_db():
    conn = get_db_connection()
    try:
        conn.execute('DELETE FROM nodes')
        conn.execute('DELETE FROM edges')
        conn.execute('DELETE FROM original_searches')
        conn.commit()
        return jsonify({'message': 'Database cleared successfully'}), 200
    except Exception as e:
        conn.rollback()
        return jsonify({'error': str(e)}), 500
    finally:
        conn.close()

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=True)
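Note that the /api/search view is declared async and awaits fetch_links_async, so Flask needs async support installed (flask[async]). Once the app is running (python map.py, serving on port 5000), a crawl can be triggered and the resulting graph read back roughly as follows. This is a minimal client sketch under those assumptions; the requests library and the example URL are not part of this commit.

# example_client.py - hypothetical usage sketch, not part of this commit
import requests

# start a crawl two levels deep from an assumed example URL
resp = requests.post(
    'http://localhost:5000/api/search',
    json={'starting_url': 'https://example.com', 'max_depth': 2},
)
print(resp.json())  # {'message': 'Crawl started for ...', 'original_search_id': ...}

# read back the stored graph
nodes = requests.get('http://localhost:5000/api/nodes').json()
edges = requests.get('http://localhost:5000/api/edges').json()
print(len(nodes), 'nodes,', len(edges), 'edges')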
BIN web_graph.db Normal file
Binary file not shown.
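web_graph.db is committed as a binary and its contents are not shown, but the queries in map.py imply it already contains nodes, edges, and original_searches tables. Below is a minimal sketch of a schema consistent with the columns map.py actually touches (id, url, source, target, max_depth, is_original); the exact types and constraints are assumptions, not taken from the commit.

# init_db.py - hypothetical setup script, not part of this commit
import sqlite3

conn = sqlite3.connect('web_graph.db')
conn.executescript('''
CREATE TABLE IF NOT EXISTS nodes (
    id     INTEGER PRIMARY KEY AUTOINCREMENT,
    url    TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS edges (
    id     INTEGER PRIMARY KEY AUTOINCREMENT,
    source INTEGER NOT NULL REFERENCES nodes(id),
    target INTEGER NOT NULL REFERENCES nodes(id)
);
CREATE TABLE IF NOT EXISTS original_searches (
    id          INTEGER PRIMARY KEY AUTOINCREMENT,
    url         TEXT NOT NULL,
    max_depth   INTEGER,
    is_original BOOLEAN
);
''')
conn.commit()
conn.close()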