neon_arch committed
Commit: c170de8 · Parent(s): f8c3c8d

add code to evade ip blocking, improve pagination code and fix documentation
Files changed:
- Cargo.lock +39 -0
- Cargo.toml +5 -2
- src/cache/cacher.rs +78 -0
- src/cache/mod.rs +1 -0
- src/config_parser/parser.rs +5 -0
- src/config_parser/parser_models.rs +11 -8
- src/engines/duckduckgo.rs +27 -17
- src/engines/searx.rs +18 -16
- src/lib.rs +1 -0
- src/search_results_handler/aggregation_models.rs +7 -7
- src/search_results_handler/aggregator.rs +2 -2
- src/server/routes.rs +65 -9
- tests/index.rs +2 -0
- websurfx/config.lua +3 -0
Cargo.lock
CHANGED

@@ -447,6 +447,16 @@ dependencies = [
  "bitflags",
 ]
 
+[[package]]
+name = "combine"
+version = "4.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4"
+dependencies = [
+ "bytes 1.4.0",
+ "memchr",
+]
+
 [[package]]
 name = "convert_case"
 version = "0.4.0"
@@ -1427,6 +1437,12 @@ version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
 
+[[package]]
+name = "md5"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
+
 [[package]]
 name = "memchr"
 version = "2.5.0"
@@ -2157,6 +2173,20 @@ dependencies = [
  "rand_core 0.3.1",
 ]
 
+[[package]]
+name = "redis"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ea8c51b5dc1d8e5fd3350ec8167f464ec0995e79f2e90a075b63371500d557f"
+dependencies = [
+ "combine",
+ "itoa 1.0.6",
+ "percent-encoding 2.2.0",
+ "ryu",
+ "sha1_smol",
+ "url 2.3.1",
+]
+
 [[package]]
 name = "redox_syscall"
 version = "0.1.57"
@@ -2526,6 +2556,12 @@ dependencies = [
  "digest",
 ]
 
+[[package]]
+name = "sha1_smol"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
+
 [[package]]
 name = "sha2"
 version = "0.10.6"
@@ -3291,6 +3327,9 @@ dependencies = [
  "fake-useragent",
  "handlebars",
  "log",
+ "md5",
+ "rand 0.6.5",
+ "redis",
  "reqwest 0.11.17",
  "rlua",
  "scraper",
Cargo.toml
CHANGED

@@ -15,6 +15,9 @@ actix-web = {version="4.3.1"}
 actix-files = {version="0.6.2"}
 serde_json = {version="*"}
 fake-useragent = {version="*"}
-env_logger = "0.10.0"
-log = "0.4.17"
+env_logger = {version="0.10.0"}
+log = {version="0.4.17"}
 rlua = {version="*"}
+redis = {version="*"}
+md5 = {version="*"}
+rand={version="*"}
src/cache/cacher.rs
ADDED

@@ -0,0 +1,78 @@ (full contents of the new file)

//! This module provides the functionality to cache the aggregated results fetched and aggregated
//! from the upstream search engines in a json format.

use md5::compute;
use redis::{Client, Commands, Connection};

/// A named struct which stores the redis Connection url address to which the client will
/// connect to.
///
/// # Fields
///
/// * `redis_connection_url` - It stores the redis Connection url address.
#[derive(Clone)]
pub struct RedisCache {
    redis_connection_url: String,
}

impl RedisCache {
    /// Constructs a new `SearchResult` with the given arguments needed for the struct.
    ///
    /// # Arguments
    ///
    /// * `redis_connection_url` - It stores the redis Connection url address.
    pub fn new(redis_connection_url: String) -> Self {
        RedisCache {
            redis_connection_url,
        }
    }

    /// A helper function which computes the hash of the url and formats and returns it as string.
    ///
    /// # Arguments
    ///
    /// * `url` - It takes an url as string.
    fn compute_url_hash(self, url: &str) -> String {
        format!("{:?}", compute(url))
    }

    /// A function which fetches the cached json results as json string from the redis server.
    ///
    /// # Arguments
    ///
    /// * `url` - It takes an url as a string.
    pub fn cached_results_json(self, url: String) -> Result<String, Box<dyn std::error::Error>> {
        let hashed_url_string = self.clone().compute_url_hash(&url);
        let mut redis_connection: Connection =
            Client::open(self.redis_connection_url)?.get_connection()?;
        Ok(redis_connection.get(hashed_url_string)?)
    }

    /// A function which caches the results by using the hashed `url` as the key and
    /// `json results` as the value and stores it in redis server with ttl(time to live)
    /// set to 60 seconds.
    ///
    /// # Arguments
    ///
    /// * `json_results` - It takes the json results string as an argument.
    /// * `url` - It takes the url as a String.
    pub fn cache_results(
        self,
        json_results: String,
        url: String,
    ) -> Result<(), Box<dyn std::error::Error>> {
        let hashed_url_string = self.clone().compute_url_hash(&url);
        let mut redis_connection: Connection =
            Client::open(self.redis_connection_url)?.get_connection()?;

        // put results_json into cache
        redis_connection.set(hashed_url_string.clone(), json_results)?;

        // Set the TTL for the key to 60 seconds
        redis_connection
            .expire::<String, u32>(hashed_url_string.clone(), 60)
            .unwrap();

        Ok(())
    }
}
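For context, a minimal usage sketch of the new cache (not part of the commit; it assumes a redis server listening at the url configured in websurfx/config.lua, the `websurfx` crate name for the library, and a hypothetical page url and payload):

use websurfx::cache::cacher::RedisCache;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let cache = RedisCache::new("redis://127.0.0.1:8082".to_string());

    // Hypothetical page url and results payload, for illustration only.
    let page_url = "http://127.0.0.1:8080/search?q=rust&page=1".to_string();
    cache
        .clone()
        .cache_results(r#"{"results":[]}"#.to_string(), page_url.clone())?;

    // Within the 60 second TTL, the same url is a cache hit.
    let cached_json = cache.cached_results_json(page_url)?;
    println!("{cached_json}");
    Ok(())
}

Both methods take `self` by value, which is why the struct derives `Clone` and callers (like the route code further down) clone the cache before each call.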
src/cache/mod.rs
ADDED

@@ -0,0 +1 @@
+pub mod cacher;
src/config_parser/parser.rs
CHANGED

@@ -11,11 +11,15 @@ use std::fs
 //
 /// * `port` - It stores the parsed port number option on which the server should launch.
 /// * `binding_ip_addr` - It stores the parsed ip address option on which the server should launch
+/// * `style` - It stores the theming options for the website.
+/// * `redis_connection_url` - It stores the redis connection url address on which the redis
+/// client should connect.
 #[derive(Clone)]
 pub struct Config {
     pub port: u16,
     pub binding_ip_addr: String,
     pub style: Style,
+    pub redis_connection_url: String,
 }
 
 impl Config {
@@ -44,6 +48,7 @@ impl Config {
                 globals.get::<_, String>("theme")?,
                 globals.get::<_, String>("colorscheme")?,
             ),
+            redis_connection_url: globals.get::<_, String>("redis_connection_url")?,
         })
     })
 }
src/config_parser/parser_models.rs
CHANGED

@@ -1,21 +1,24 @@
 //! This module provides public models for handling, storing and serializing parsed config file
 //! options from config.lua by grouping them togather.
 
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 
-/// A named struct which stores, serializes and groups the parsed config file options of theme and
-/// colorscheme names into the Style struct which derives the `Clone` and `Serialize` traits
-/// where the `Clone` trait is derived for allowing the struct to be cloned and passed to the
-/// server as a shared data between all routes except `/robots.txt` and the `Serialize` trait
-/// has been derived for allowing the object to be serialized so that it can be passed to
-/// handlebars template files
+/// A named struct which stores,deserializes, serializes and groups the parsed config file options
+/// of theme and colorscheme names into the Style struct which derives the `Clone`, `Serialize`
+/// and `Deserialize` traits where the `Clone` trait is derived for allowing the struct to be
+/// cloned and passed to the server as a shared data between all routes except `/robots.txt` and
+/// the `Serialize` trait has been derived for allowing the object to be serialized so that it
+/// can be passed to handlebars template files and the `Deserialize` trait has been derived in
+/// order to allow the deserializing the json back to struct in aggregate function in
+/// aggregator.rs and create a new struct out of it and then serialize it back to json and pass
+/// it to the template files.
 ///
 /// # Fields
 //
 /// * `theme` - It stores the parsed theme option used to set a theme for the website.
 /// * `colorscheme` - It stores the parsed colorscheme option used to set a colorscheme for the
 /// theme being used.
-#[derive(Serialize, Clone)]
+#[derive(Serialize, Deserialize, Clone)]
 pub struct Style {
     pub theme: String,
     pub colorscheme: String,
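The point of the new `Deserialize` derive is the json round trip through the cache. A standalone sketch (the struct is re-declared locally so the snippet compiles on its own with only the serde and serde_json crates):

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Clone)]
struct Style {
    theme: String,
    colorscheme: String,
}

fn main() -> Result<(), serde_json::Error> {
    let style = Style {
        theme: "simple".to_string(),
        colorscheme: "catppuccin-mocha".to_string(),
    };
    // Serialize for the cache / handlebars templates ...
    let json = serde_json::to_string(&style)?;
    // ... and turn the cached json back into a struct, which the old
    // #[derive(Serialize, Clone)] alone could not do.
    let restored: Style = serde_json::from_str(&json)?;
    assert_eq!(restored.theme, style.theme);
    Ok(())
}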
src/engines/duckduckgo.rs
CHANGED

@@ -2,9 +2,10 @@
 //! by querying the upstream duckduckgo search engine with user provided query and with a page
 //! number if provided.
 
-use std::collections::HashMap;
+use std::{collections::HashMap, time::Duration};
 
-use reqwest::header::USER_AGENT;
+use rand::Rng;
+use reqwest::header::{HeaderMap, CONTENT_TYPE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
 
 use crate::search_results_handler::aggregation_models::RawSearchResult;
@@ -17,7 +18,7 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
 /// # Arguments
 ///
 /// * `query` - Takes the user provided query to query to the upstream search engine with.
-/// * `page` - Takes an …
+/// * `page` - Takes an u32 as an argument.
 /// * `user_agent` - Takes a random user agent string as an argument.
 ///
 /// # Errors
@@ -27,32 +28,41 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
 /// selector fails to initialize"
 pub async fn results(
     query: &str,
-    page: …,
+    page: u32,
     user_agent: &str,
 ) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
     // Page number can be missing or empty string and so appropriate handling is required
     // so that upstream server recieves valid page number.
     let url: String = match page {
-        … (the old match arm; its lines were truncated in the page extraction)
-        }
+        1 => {
+            format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
+        }
+        _ => {
+            format!(
+                "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
+                query,
+                (page / 2 + (page % 2)) * 30,
+                (page / 2 + (page % 2)) * 30 + 1
+            )
+        }
         }
-        None => format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js"),
     };
 
+    // Add a random delay before making the request.
+    let mut rng = rand::thread_rng();
+    let delay_secs = rng.gen_range(1, 10);
+    std::thread::sleep(Duration::from_secs(delay_secs));
+
+    // initializing HeaderMap and adding appropriate headers.
+    let mut header_map = HeaderMap::new();
+    header_map.insert(USER_AGENT, user_agent.parse()?);
+    header_map.insert(REFERER, "https://google.com/".parse()?);
+    header_map.insert(CONTENT_TYPE, "text/html; charset=UTF-8".parse()?);
+
     // fetch the html from upstream duckduckgo engine
     // TODO: Write better error handling code to handle no results case.
     let results: String = reqwest::Client::new()
         .get(url)
-        .header(USER_AGENT, user_agent)
+        .headers(header_map) // add spoofed headers to emulate human behaviour
         .send()
         .await?
         .text()
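The `s` and `dc` parameters in the paginated url above come from `(page / 2 + (page % 2)) * 30`, which is ceil(page / 2) * 30 in integer arithmetic. A standalone sketch of the mapping (not part of the commit):

fn result_offset(page: u32) -> (u32, u32) {
    // ceil(page / 2) * 30, written with integer arithmetic as in the diff above.
    let s = (page / 2 + (page % 2)) * 30;
    (s, s + 1) // used as the `s` and `dc` query parameters
}

fn main() {
    for page in 2..=5 {
        let (s, dc) = result_offset(page);
        println!("page {page}: s={s}, dc={dc}");
    }
    // page 2: s=30, dc=31
    // page 3: s=60, dc=61
    // page 4: s=60, dc=61
    // page 5: s=90, dc=91
}

Note that consecutive pages can map to the same offset (pages 3 and 4 both yield s=60), so those two pages fetch identical upstream results.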
src/engines/searx.rs
CHANGED

@@ -2,10 +2,10 @@
 //! by querying the upstream searx search engine instance with user provided query and with a page
 //! number if provided.
 
-use std::collections::HashMap;
-
-use reqwest::header::USER_AGENT;
+use rand::Rng;
+use reqwest::header::{HeaderMap, CONTENT_TYPE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
+use std::{collections::HashMap, time::Duration};
 
 use crate::search_results_handler::aggregation_models::RawSearchResult;
@@ -17,7 +17,7 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
 /// # Arguments
 ///
 /// * `query` - Takes the user provided query to query to the upstream search engine with.
-/// * `page` - Takes an …
+/// * `page` - Takes an u32 as an argument.
 /// * `user_agent` - Takes a random user agent string as an argument.
 ///
 /// # Errors
@@ -27,27 +27,29 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
 /// selector fails to initialize"
 pub async fn results(
     query: &str,
-    page: …,
+    page: u32,
     user_agent: &str,
 ) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
     // Page number can be missing or empty string and so appropriate handling is required
     // so that upstream server recieves valid page number.
-    let url: String = … (the old url construction; its lines were truncated in the page extraction)
+    let url: String = format!("https://searx.work/search?q={query}&pageno={page}");
+
+    // Add random delay before making the request.
+    let mut rng = rand::thread_rng();
+    let delay_secs = rng.gen_range(1, 10);
+    std::thread::sleep(Duration::from_secs(delay_secs));
+
+    // initializing headers and adding appropriate headers.
+    let mut header_map = HeaderMap::new();
+    header_map.insert(USER_AGENT, user_agent.parse()?);
+    header_map.insert(REFERER, "https://google.com/".parse()?);
+    header_map.insert(CONTENT_TYPE, "application/x-www-form-urlencoded".parse()?);
 
     // fetch the html from upstream searx instance engine
     // TODO: Write better error handling code to handle no results case.
    let results: String = reqwest::Client::new()
         .get(url)
-        .header(USER_AGENT, user_agent)
+        .headers(header_map) // add spoofed headers to emulate human behaviours.
         .send()
         .await?
         .text()
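Both engines now sleep 1–9 seconds before hitting the upstream server. The two-argument `gen_range(1, 10)` is the rand 0.6 form (Cargo.lock above pins `rand 0.6.5`; rand 0.8+ would write `gen_range(1..10)`). The jitter in isolation, as a sketch:

use rand::Rng;
use std::time::Duration;

/// Sleep for a random 1-9 seconds so request timing looks less bot-like.
fn random_delay() {
    let mut rng = rand::thread_rng();
    let delay_secs: u64 = rng.gen_range(1, 10); // rand 0.6: half-open range [1, 10)
    std::thread::sleep(Duration::from_secs(delay_secs));
}

fn main() {
    random_delay();
    println!("request would be sent here");
}

Since `results` is an async fn, `std::thread::sleep` blocks the whole executor thread for the duration; an async timer such as `tokio::time::sleep` would pace requests without stalling other tasks.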
src/lib.rs
CHANGED

@@ -1,6 +1,7 @@
 //! This main library module provides the functionality to provide and handle the Tcp server
 //! and register all the routes for the `websurfx` meta search engine website.
 
+pub mod cache;
 pub mod config_parser;
 pub mod engines;
 pub mod search_results_handler;
src/search_results_handler/aggregation_models.rs
CHANGED

@@ -1,12 +1,12 @@
 //! This module provides public models for handling, storing and serializing of search results
 //! data scraped from the upstream search engines.
 
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 
 use crate::config_parser::parser_models::Style;
 
-/// A named struct to store and serialize the individual search result from all the scraped
-/// and aggregated search results from the upstream search engines.
+/// A named struct to store, serialize and deserializes the individual search result from all the
+/// scraped and aggregated search results from the upstream search engines.
 ///
 /// # Fields
 ///
@@ -16,7 +16,7 @@ use crate::config_parser::parser_models::Style;
 /// * `url` - The url to be displayed below the search result title in html.
 /// * `description` - The description of the search result.
 /// * `engine` - The names of the upstream engines from which this results were provided.
-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResult {
     pub title: String,
@@ -116,15 +116,15 @@ impl RawSearchResult {
     }
 }
 
-/// A named struct to store and serialize the all the search results scraped and aggregated
-/// from the upstream search engines.
+/// A named struct to store, serialize, deserialize the all the search results scraped and
+/// aggregated from the upstream search engines.
 ///
 /// # Fields
 ///
 /// * `results` - Stores the individual serializable `SearchResult` struct into a vector of
 /// `SearchResult` structs.
 /// * `page_query` - Stores the current pages search query `q` provided in the search url.
-#[derive(Serialize)]
+#[derive(Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResults {
     pub results: Vec<SearchResult>,
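With `Deserialize` derived on both structs, the json string fetched from redis parses straight back into `SearchResults`. A trimmed stand-in sketch of that round trip, also showing the effect of `rename_all = "camelCase"` (the payload and the `Vec<String>` field are illustrative simplifications):

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
struct SearchResults {
    results: Vec<String>, // stand-in for the real Vec<SearchResult>
    page_query: String,
}

fn main() -> Result<(), serde_json::Error> {
    // `page_query` is written as `pageQuery` in the cached json because of rename_all.
    let cached = r#"{"results":[],"pageQuery":"rust"}"#;
    let parsed: SearchResults = serde_json::from_str(cached)?;
    assert_eq!(parsed.page_query, "rust");
    Ok(())
}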
src/search_results_handler/aggregator.rs
CHANGED

@@ -25,7 +25,7 @@ use crate::engines::{duckduckgo, searx};
 /// # Arguments
 ///
 /// * `query` - Accepts a string to query with the above upstream search engines.
-/// * `page` - Accepts an …
+/// * `page` - Accepts an u32 page number.
 ///
 /// # Error
 ///
@@ -34,7 +34,7 @@ use crate::engines::{duckduckgo, searx};
 /// containing appropriate values.
 pub async fn aggregate(
     query: &str,
-    page: …,
+    page: u32,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
     let user_agent: String = random_user_agent();
     let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
src/server/routes.rs
CHANGED

@@ -4,7 +4,11 @@
 
 use std::fs::read_to_string;
 
-use crate::{…};
+use crate::{
+    cache::cacher::RedisCache,
+    config_parser::parser::Config,
+    search_results_handler::{aggregation_models::SearchResults, aggregator::aggregate},
+};
 use actix_web::{get, web, HttpRequest, HttpResponse};
 use handlebars::Handlebars;
 use serde::Deserialize;
@@ -67,6 +71,9 @@ pub async fn search(
     config: web::Data<Config>,
 ) -> Result<HttpResponse, Box<dyn std::error::Error>> {
     let params = web::Query::<SearchParams>::from_query(req.query_string())?;
+
+    //Initialize redis cache connection struct
+    let redis_cache = RedisCache::new(config.redis_connection_url.clone());
     match &params.q {
         Some(query) => {
             if query.trim().is_empty() {
@@ -74,11 +81,63 @@ pub async fn search(
                     .insert_header(("location", "/"))
                     .finish())
             } else {
-                … (the old else branch; its lines were truncated in the page extraction)
+                // Initialize the page url as an empty string
+                let mut page_url = String::new();
+
+                // Find whether the page is valid page number if not then return
+                // the first page number and also construct the page_url accordingly
+                let page = match params.page {
+                    Some(page_number) => {
+                        if page_number <= 1 {
+                            page_url = format!(
+                                "http://{}:{}/search?q={}&page={}",
+                                config.binding_ip_addr, config.port, query, 1
+                            );
+                            1
+                        } else {
+                            page_url = format!(
+                                "http://{}:{}/search?q={}&page={}",
+                                config.binding_ip_addr, config.port, query, page_number
+                            );
+
+                            page_number
+                        }
+                    }
+                    None => {
+                        page_url = format!(
+                            "http://{}:{}{}&page={}",
+                            config.binding_ip_addr,
+                            config.port,
+                            req.uri(),
+                            1
+                        );
+
+                        1
+                    }
+                };
+
+                // fetch the cached results json.
+                let cached_results_json = redis_cache.clone().cached_results_json(page_url.clone());
+                // check if fetched results was indeed fetched or it was an error and if so
+                // handle the data accordingly.
+                match cached_results_json {
+                    Ok(results_json) => {
+                        let new_results_json: SearchResults = serde_json::from_str(&results_json)?;
+                        let page_content: String = hbs.render("search", &new_results_json)?;
+                        Ok(HttpResponse::Ok().body(page_content))
+                    }
+                    Err(_) => {
+                        let mut results_json: crate::search_results_handler::aggregation_models::SearchResults =
+                            aggregate(query, page).await?;
+                        results_json.add_style(config.style.clone());
+                        redis_cache.clone().cache_results(
+                            serde_json::to_string(&results_json)?,
+                            page_url.clone(),
+                        )?;
+                        let page_content: String = hbs.render("search", &results_json)?;
+                        Ok(HttpResponse::Ok().body(page_content))
+                    }
+                }
             }
         }
         None => Ok(HttpResponse::Found()
@@ -115,6 +174,3 @@ pub async fn settings(
     let page_content: String = hbs.render("settings", &config.style)?;
     Ok(HttpResponse::Ok().body(page_content))
 }
-
-// TODO: Write tests for tesing parameters for search function that if provided with something
-// other than u32 like alphabets and special characters than it should panic
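The cache key in the route above is `md5(page_url)` (see src/cache/cacher.rs), so two requests share a cache entry only when their page urls are byte-identical; that is why `page_number <= 1` is normalized to `page=1` before the url is built. A small sketch of what the normalization prevents (urls are illustrative):

fn main() {
    // Without normalization, page 0 and page 1 would render the same results
    // but hash to two different cache keys.
    let url_page0 = "http://127.0.0.1:8080/search?q=rust&page=0";
    let url_page1 = "http://127.0.0.1:8080/search?q=rust&page=1";
    assert_ne!(
        format!("{:?}", md5::compute(url_page0)),
        format!("{:?}", md5::compute(url_page1))
    );
}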
tests/index.rs
CHANGED

@@ -41,3 +41,5 @@ async fn test_index() {
     assert_eq!(res.text().await.unwrap(), template);
 }
 
+// TODO: Write tests for tesing parameters for search function that if provided with something
+// other than u32 like alphabets and special characters than it should panic
websurfx/config.lua
CHANGED

@@ -16,3 +16,6 @@ binding_ip_addr = "127.0.0.1" --ip address on the which server should be launched
 -- }}
 colorscheme = "catppuccin-mocha" -- the colorscheme name which should be used for the website theme
 theme = "simple" -- the theme name which should be used for the website
+
+-- Caching
+redis_connection_url = "redis://127.0.0.1:8082" -- redis connection url address on which the client should connect on.