Upload files to "/"
This commit is contained in:
parent cecd31f124
commit 520814d985
169  map.py  Normal file
@@ -0,0 +1,169 @@
import sqlite3
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from flask import Flask, jsonify, request, abort, render_template, Response
from flask_cors import CORS

app = Flask(__name__)
CORS(app)

progress_stream = []  # reserved for streaming crawl progress; not used below


def get_db_connection():
    # Open the SQLite database; sqlite3.Row lets rows be read like dicts.
    conn = sqlite3.connect('web_graph.db')
    conn.row_factory = sqlite3.Row
    return conn


def insert_node(url):
    # Insert the URL into the nodes table if it is not there yet; return its id.
    conn = get_db_connection()
    c = conn.cursor()
    c.execute('SELECT id FROM nodes WHERE url = ?', (url,))
    node = c.fetchone()
    if node is None:
        c.execute('INSERT INTO nodes (url) VALUES (?)', (url,))
        conn.commit()
        node_id = c.lastrowid
    else:
        node_id = node['id']
    conn.close()
    return node_id


def insert_edge(source_id, target_id):
    # Insert a source -> target edge unless it already exists; return its id.
    conn = get_db_connection()
    c = conn.cursor()
    # Check if the edge already exists
    c.execute('SELECT id FROM edges WHERE source = ? AND target = ?', (source_id, target_id))
    edge = c.fetchone()
    if edge is None:
        c.execute('INSERT INTO edges (source, target) VALUES (?, ?)', (source_id, target_id))
        conn.commit()
        edge_id = c.lastrowid
    else:
        edge_id = edge['id']
    conn.close()
    return edge_id


def insert_original_search(url, depth):
    # Record the starting URL and requested depth of a crawl; return the new row id.
    conn = get_db_connection()
    c = conn.cursor()
    c.execute('INSERT INTO original_searches (url, max_depth, is_original) VALUES (?, ?, ?)', (url, depth, True))
    conn.commit()
    search_id = c.lastrowid
    conn.close()
    return search_id


async def fetch_links_async(url, visited, depth=1, max_depth=4):
    # Recursively crawl outgoing links up to max_depth, storing nodes and edges.
    if depth > max_depth:
        return

    source_id = insert_node(url)

    # Only proceed with crawling if the URL hasn't been visited yet
    if url not in visited:
        visited.add(url)
        print(f"Currently fetching links from: {url}")

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }

        async with aiohttp.ClientSession(headers=headers) as session:
            try:
                async with session.get(url) as response:
                    response.raise_for_status()
                    html_content = await response.text()
                    soup = BeautifulSoup(html_content, 'html.parser')
                    tasks = []

                    for link in soup.find_all('a', href=True):
                        href = link['href']
                        if href.startswith('http'):
                            target_id = insert_node(href)
                            insert_edge(source_id, target_id)
                            # Avoid re-crawling if the URL is already visited
                            if href not in visited:
                                print(f"Planning to visit: {href}")
                                task = fetch_links_async(href, visited, depth + 1, max_depth)
                                tasks.append(task)

                    await asyncio.gather(*tasks)

            except (aiohttp.ClientError, aiohttp.ClientResponseError) as e:
                print(f"Request failed for {url}: {e}")
            except Exception as e:
                print(f"Error processing link {url}: {e}")

        if depth == 1:
            print(f"Finished going over {url}.")
    else:
        print(f"Skipping already visited {url}.")


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/test')
def test():
    return render_template('test.html')


# Async view: requires Flask 2.0+ installed with the async extra (flask[async]).
@app.route('/api/search', methods=['GET', 'POST'])
async def search():
    if request.method == 'POST':
        data = request.json
        starting_url = data.get('starting_url')
        max_depth = data.get('max_depth', 4)
    else:
        starting_url = request.args.get('starting_url')
        max_depth = request.args.get('max_depth', 4)

    if not starting_url or not starting_url.startswith('http'):
        abort(400, description="Invalid starting URL provided.")

    try:
        max_depth = int(max_depth)
    except ValueError:
        abort(400, description="Invalid max depth provided.")

    original_search_id = insert_original_search(starting_url, max_depth)

    visited_urls = set()
    await fetch_links_async(starting_url, visited_urls, max_depth=max_depth)
    return jsonify({"message": "Crawl started for {}".format(starting_url), 'original_search_id': original_search_id})


@app.route('/api/nodes', methods=['GET'])
def get_nodes():
    conn = get_db_connection()
    nodes = conn.execute('SELECT * FROM nodes').fetchall()
    conn.close()
    return jsonify([dict(node) for node in nodes])


@app.route('/api/edges', methods=['GET'])
def get_edges():
    conn = get_db_connection()
    edges = conn.execute('SELECT * FROM edges').fetchall()
    conn.close()
    return jsonify([dict(edge) for edge in edges])


@app.route('/api/original_search_ids', methods=['GET'])
def get_original_search_ids():
    conn = get_db_connection()
    search_ids = conn.execute('SELECT * FROM original_searches').fetchall()
    conn.close()
    return jsonify([dict(search_id) for search_id in search_ids])


@app.route('/api/cleardb', methods=['POST'])
def clear_db():
    conn = get_db_connection()
    try:
        conn.execute('DELETE FROM nodes')
        conn.execute('DELETE FROM edges')
        conn.execute('DELETE FROM original_searches')
        conn.commit()
        return jsonify({'message': 'Database cleared successfully'}), 200
    except Exception as e:
        conn.rollback()
        return jsonify({'error': str(e)}), 500
    finally:
        conn.close()


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=True)
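
The routes above can be exercised over HTTP once the server is running. A minimal sketch using the requests library, assuming the app is served locally on port 5000; https://example.com and max_depth=2 are placeholder values, not part of this commit:

import requests

BASE = 'http://localhost:5000'  # assumed local development address

# Start a crawl; the view awaits fetch_links_async, so the request blocks until it finishes.
resp = requests.post(f'{BASE}/api/search',
                     json={'starting_url': 'https://example.com', 'max_depth': 2})
print(resp.json())  # e.g. {'message': 'Crawl started for ...', 'original_search_id': ...}

# Read back the stored graph.
nodes = requests.get(f'{BASE}/api/nodes').json()
edges = requests.get(f'{BASE}/api/edges').json()
print(len(nodes), 'nodes,', len(edges), 'edges')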
BIN  web_graph.db  Normal file
Binary file not shown.
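
The committed web_graph.db is not shown, but the queries in map.py imply its shape. A minimal setup sketch that would create a compatible schema; the exact column types and constraints of the committed database are an assumption inferred from the SELECT/INSERT statements:

import sqlite3

# Hypothetical schema inferred from map.py; the actual web_graph.db may differ.
conn = sqlite3.connect('web_graph.db')
conn.executescript('''
CREATE TABLE IF NOT EXISTS nodes (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    url TEXT
);
CREATE TABLE IF NOT EXISTS edges (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source INTEGER,
    target INTEGER
);
CREATE TABLE IF NOT EXISTS original_searches (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    url TEXT,
    max_depth INTEGER,
    is_original BOOLEAN
);
''')
conn.commit()
conn.close()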