2020-07-14 18:00:09 +02:00
|
|
|
use std::{
|
2022-02-22 20:48:00 +01:00
|
|
|
net::IpAddr,
|
|
|
|
sync::Arc,
|
2020-07-14 18:00:09 +02:00
|
|
|
time::{Duration, SystemTime},
|
|
|
|
};
|
2019-01-31 15:49:58 +01:00
|
|
|
|
2022-02-22 20:48:00 +01:00
|
|
|
use bytes::{Bytes, BytesMut};
|
2021-11-07 18:53:39 +01:00
|
|
|
use futures::{stream::StreamExt, TryFutureExt};
|
2020-07-14 18:00:09 +02:00
|
|
|
use once_cell::sync::Lazy;
|
2019-01-27 15:39:19 +01:00
|
|
|
use regex::Regex;
|
2022-02-22 20:48:00 +01:00
|
|
|
use reqwest::{
|
|
|
|
header::{self, HeaderMap, HeaderValue},
|
|
|
|
Client, Response,
|
|
|
|
};
|
2021-11-07 18:53:39 +01:00
|
|
|
use rocket::{http::ContentType, response::Redirect, Route};
|
|
|
|
use tokio::{
|
|
|
|
fs::{create_dir_all, remove_file, symlink_metadata, File},
|
|
|
|
io::{AsyncReadExt, AsyncWriteExt},
|
2022-02-22 20:48:00 +01:00
|
|
|
net::lookup_host,
|
2021-12-20 10:34:31 +01:00
|
|
|
};
|
2019-01-27 15:39:19 +01:00
|
|
|
|
2023-08-04 16:50:38 +02:00
|
|
|
use html5gum::{Emitter, HtmlString, InfallibleTokenizer, Readable, StringReader, Tokenizer};
|
2022-02-22 20:48:00 +01:00
|
|
|
|
2021-04-15 18:30:23 +02:00
|
|
|
use crate::{
|
|
|
|
error::Error,
|
|
|
|
util::{get_reqwest_client_builder, Cached},
|
|
|
|
CONFIG,
|
|
|
|
};
|
2019-01-27 15:39:19 +01:00
|
|
|
|
2018-02-10 01:00:55 +01:00
|
|
|
pub fn routes() -> Vec<Route> {
|
2021-12-20 10:34:31 +01:00
|
|
|
match CONFIG.icon_service().as_str() {
|
|
|
|
"internal" => routes![icon_internal],
|
2022-07-17 16:21:03 +02:00
|
|
|
_ => routes![icon_external],
|
2021-12-20 10:34:31 +01:00
|
|
|
}
|
2018-02-10 01:00:55 +01:00
|
|
|
}
|
|
|
|
|
2020-03-09 22:04:03 +01:00
|
|
|
static CLIENT: Lazy<Client> = Lazy::new(|| {
|
2020-12-10 23:13:24 +01:00
|
|
|
// Generate the default headers
|
2022-02-22 20:48:00 +01:00
|
|
|
let mut default_headers = HeaderMap::new();
|
|
|
|
default_headers.insert(header::USER_AGENT, HeaderValue::from_static("Links (2.22; Linux X86_64; GNU C; text)"));
|
|
|
|
default_headers.insert(header::ACCEPT, HeaderValue::from_static("text/html, text/*;q=0.5, image/*, */*;q=0.1"));
|
|
|
|
default_headers.insert(header::ACCEPT_LANGUAGE, HeaderValue::from_static("en,*;q=0.1"));
|
|
|
|
default_headers.insert(header::CACHE_CONTROL, HeaderValue::from_static("no-cache"));
|
|
|
|
default_headers.insert(header::PRAGMA, HeaderValue::from_static("no-cache"));
|
|
|
|
|
|
|
|
// Generate the cookie store
|
|
|
|
let cookie_store = Arc::new(Jar::default());
|
2020-12-10 23:13:24 +01:00
|
|
|
|
2023-08-04 16:50:38 +02:00
|
|
|
let icon_download_timeout = Duration::from_secs(CONFIG.icon_download_timeout());
|
|
|
|
let pool_idle_timeout = Duration::from_secs(10);
|
2019-01-29 21:20:59 +01:00
|
|
|
// Reuse the client between requests
|
2022-02-22 20:48:00 +01:00
|
|
|
let client = get_reqwest_client_builder()
|
2022-07-10 16:39:38 +02:00
|
|
|
.cookie_provider(Arc::clone(&cookie_store))
|
2023-08-04 16:50:38 +02:00
|
|
|
.timeout(icon_download_timeout)
|
|
|
|
.pool_max_idle_per_host(5) // Configure the Hyper Pool to only have max 5 idle connections
|
|
|
|
.pool_idle_timeout(pool_idle_timeout) // Configure the Hyper Pool to timeout after 10 seconds
|
|
|
|
.trust_dns(true)
|
2022-02-22 20:48:00 +01:00
|
|
|
.default_headers(default_headers.clone());
|
|
|
|
|
|
|
|
match client.build() {
|
|
|
|
Ok(client) => client,
|
|
|
|
Err(e) => {
|
|
|
|
error!("Possible trust-dns error, trying with trust-dns disabled: '{e}'");
|
|
|
|
get_reqwest_client_builder()
|
|
|
|
.cookie_provider(cookie_store)
|
2023-08-04 16:50:38 +02:00
|
|
|
.timeout(icon_download_timeout)
|
|
|
|
.pool_max_idle_per_host(5) // Configure the Hyper Pool to only have max 5 idle connections
|
|
|
|
.pool_idle_timeout(pool_idle_timeout) // Configure the Hyper Pool to timeout after 10 seconds
|
2022-02-22 20:48:00 +01:00
|
|
|
.trust_dns(false)
|
2023-08-04 16:50:38 +02:00
|
|
|
.default_headers(default_headers)
|
2022-02-22 20:48:00 +01:00
|
|
|
.build()
|
|
|
|
.expect("Failed to build client")
|
|
|
|
}
|
|
|
|
}
|
2020-03-09 22:04:03 +01:00
|
|
|
});
|
2019-01-29 21:20:59 +01:00
|
|
|
|
2020-12-08 17:33:15 +01:00
|
|
|
// Build Regex only once since this takes a lot of time.
|
2020-07-14 21:58:27 +02:00
|
|
|
static ICON_SIZE_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?x)(\d+)\D*(\d+)").unwrap());
|
|
|
|
|
2020-12-08 17:33:15 +01:00
|
|
|
// Special HashMap which holds the user defined Regex to speedup matching the regex.
|
2022-05-20 23:49:05 +02:00
|
|
|
static ICON_BLACKLIST_REGEX: Lazy<dashmap::DashMap<String, Regex>> = Lazy::new(dashmap::DashMap::new);
|
2020-12-08 17:33:15 +01:00
|
|
|
|
2022-02-22 20:48:00 +01:00
|
|
|
async fn icon_redirect(domain: &str, template: &str) -> Option<Redirect> {
|
2022-07-10 16:39:38 +02:00
|
|
|
if !is_valid_domain(domain) {
|
2021-12-20 10:34:31 +01:00
|
|
|
warn!("Invalid domain: {}", domain);
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
2023-02-07 11:24:23 +01:00
|
|
|
if check_domain_blacklist_reason(domain).await.is_some() {
|
2021-12-20 10:34:31 +01:00
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
|
|
|
let url = template.replace("{}", domain);
|
2021-12-30 03:01:32 +01:00
|
|
|
match CONFIG.icon_redirect_code() {
|
2022-01-09 08:40:35 +01:00
|
|
|
301 => Some(Redirect::moved(url)), // legacy permanent redirect
|
|
|
|
302 => Some(Redirect::found(url)), // legacy temporary redirect
|
2021-12-30 03:01:32 +01:00
|
|
|
307 => Some(Redirect::temporary(url)),
|
2022-01-09 08:40:35 +01:00
|
|
|
308 => Some(Redirect::permanent(url)),
|
2021-12-30 03:01:32 +01:00
|
|
|
_ => {
|
|
|
|
error!("Unexpected redirect code {}", CONFIG.icon_redirect_code());
|
|
|
|
None
|
|
|
|
}
|
|
|
|
}
|
2021-12-20 10:34:31 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
#[get("/<domain>/icon.png")]
|
2023-04-30 17:18:12 +02:00
|
|
|
async fn icon_external(domain: &str) -> Option<Redirect> {
|
|
|
|
icon_redirect(domain, &CONFIG._icon_service_url()).await
|
2021-12-20 10:34:31 +01:00
|
|
|
}
|
|
|
|
|
2020-12-08 17:33:15 +01:00
|
|
|
#[get("/<domain>/icon.png")]
|
2023-04-30 17:18:12 +02:00
|
|
|
async fn icon_internal(domain: &str) -> Cached<(ContentType, Vec<u8>)> {
|
2021-03-29 10:27:58 +02:00
|
|
|
const FALLBACK_ICON: &[u8] = include_bytes!("../static/images/fallback-icon.png");
|
|
|
|
|
2023-04-30 17:18:12 +02:00
|
|
|
if !is_valid_domain(domain) {
|
2020-12-08 17:33:15 +01:00
|
|
|
warn!("Invalid domain: {}", domain);
|
2021-03-31 22:18:35 +02:00
|
|
|
return Cached::ttl(
|
2021-11-07 18:53:39 +01:00
|
|
|
(ContentType::new("image", "png"), FALLBACK_ICON.to_vec()),
|
2021-03-31 22:18:35 +02:00
|
|
|
CONFIG.icon_cache_negttl(),
|
2021-12-28 17:24:42 +01:00
|
|
|
true,
|
2021-03-31 22:18:35 +02:00
|
|
|
);
|
2020-12-08 17:33:15 +01:00
|
|
|
}
|
|
|
|
|
2023-04-30 17:18:12 +02:00
|
|
|
match get_icon(domain).await {
|
2021-04-03 22:51:44 +02:00
|
|
|
Some((icon, icon_type)) => {
|
2021-11-07 18:53:39 +01:00
|
|
|
Cached::ttl((ContentType::new("image", icon_type), icon), CONFIG.icon_cache_ttl(), true)
|
2021-04-06 22:55:28 +02:00
|
|
|
}
|
2021-11-07 18:53:39 +01:00
|
|
|
_ => Cached::ttl((ContentType::new("image", "png"), FALLBACK_ICON.to_vec()), CONFIG.icon_cache_negttl(), true),
|
2021-03-29 10:27:58 +02:00
|
|
|
}
|
2020-12-08 17:33:15 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns if the domain provided is valid or not.
|
|
|
|
///
|
|
|
|
/// This does some manual checks and makes use of Url to do some basic checking.
|
|
|
|
/// domains can't be larger then 63 characters (not counting multiple subdomains) according to the RFC's, but we limit the total size to 255.
|
2022-07-10 16:39:38 +02:00
|
|
|
fn is_valid_domain(domain: &str) -> bool {
|
2021-03-29 10:27:58 +02:00
|
|
|
const ALLOWED_CHARS: &str = "_-.";
|
|
|
|
|
2020-12-08 17:33:15 +01:00
|
|
|
// If parsing the domain fails using Url, it will not work with reqwest.
|
2022-12-29 14:11:52 +01:00
|
|
|
if let Err(parse_error) = url::Url::parse(format!("https://{domain}").as_str()) {
|
2020-12-08 17:33:15 +01:00
|
|
|
debug!("Domain parse error: '{}' - {:?}", domain, parse_error);
|
|
|
|
return false;
|
|
|
|
} else if domain.is_empty()
|
|
|
|
|| domain.contains("..")
|
|
|
|
|| domain.starts_with('.')
|
|
|
|
|| domain.starts_with('-')
|
|
|
|
|| domain.ends_with('-')
|
|
|
|
{
|
2021-03-31 22:18:35 +02:00
|
|
|
debug!(
|
|
|
|
"Domain validation error: '{}' is either empty, contains '..', starts with an '.', starts or ends with a '-'",
|
|
|
|
domain
|
|
|
|
);
|
2020-12-08 17:33:15 +01:00
|
|
|
return false;
|
|
|
|
} else if domain.len() > 255 {
|
|
|
|
debug!("Domain validation error: '{}' exceeds 255 characters", domain);
|
2019-03-18 22:12:39 +01:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
for c in domain.chars() {
|
|
|
|
if !c.is_alphanumeric() && !ALLOWED_CHARS.contains(c) {
|
2021-04-06 22:54:42 +02:00
|
|
|
debug!("Domain validation error: '{}' contains an invalid character '{}'", domain, c);
|
2019-03-18 22:12:39 +01:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
true
|
|
|
|
}
|
|
|
|
|
2020-07-14 23:44:01 +02:00
|
|
|
/// TODO: This is extracted from IpAddr::is_global, which is unstable:
/// https://doc.rust-lang.org/nightly/std/net/enum.IpAddr.html#method.is_global
/// Remove once https://github.com/rust-lang/rust/issues/27709 is merged
///
/// Returns `true` when `ip` is not part of any of the special-purpose ranges
/// checked below (private, loopback, link-local, documentation, ...).
#[allow(clippy::nonminimal_bool)]
#[cfg(not(feature = "unstable"))]
fn is_global(ip: IpAddr) -> bool {
    match ip {
        IpAddr::V4(v4) => {
            // 192.0.0.9 and 192.0.0.10 are the only two globally routable
            // addresses in the 192.0.0.0/24 range.
            if matches!(v4.octets(), [192, 0, 0, 9 | 10]) {
                return true;
            }

            let [a, b, c, _] = v4.octets();
            let shared = a == 100 && (b & 0b1100_0000) == 0b0100_0000; // 100.64.0.0/10
            let ietf_protocol = a == 192 && b == 0 && c == 0; // 192.0.0.0/24
            let reserved = (a & 240) == 240 && !v4.is_broadcast(); // 240.0.0.0/4
            let benchmarking = a == 198 && (b & 0xfe) == 18; // 198.18.0.0/15
            let this_network = a == 0; // 0.0.0.0/8

            !(v4.is_private()
                || v4.is_loopback()
                || v4.is_link_local()
                || v4.is_broadcast()
                || v4.is_documentation()
                || shared
                || ietf_protocol
                || reserved
                || benchmarking
                || this_network)
        }
        IpAddr::V6(v6) => {
            let segments = v6.segments();
            let first = segments[0];

            // Multicast addresses are only global when their scope nibble is 14.
            if v6.is_multicast() {
                return first & 0x000f == 14;
            }

            let link_local = (first & 0xffc0) == 0xfe80; // fe80::/10
            let unique_local = (first & 0xfe00) == 0xfc00; // fc00::/7
            let documentation = first == 0x2001 && segments[1] == 0xdb8; // 2001:db8::/32

            !(v6.is_loopback() || link_local || unique_local || v6.is_unspecified() || documentation)
        }
    }
}

/// With the `unstable` feature enabled the check is delegated to the standard
/// library implementation.
#[cfg(feature = "unstable")]
fn is_global(ip: IpAddr) -> bool {
    ip.is_global()
}
|
|
|
|
|
|
|
|
/// These are some tests to check that the local `is_global` matches the standard library one.
/// The IPv4 space can be checked completely in 5 mins or so; it was correct as of nightly 2020-07-11.
/// The IPv6 space can't be checked in a reasonable time, so about ten billion random addresses are checked.
/// Note that the is_global implementation is subject to change as new IP RFCs are created
///
/// To run while showing progress output:
/// cargo test --features sqlite,unstable -- --nocapture --ignored
#[cfg(test)]
#[cfg(feature = "unstable")]
mod tests {
    use super::*;

    #[test]
    #[ignore]
    fn test_ipv4_global() {
        // Inclusive ranges: an exclusive `0..u8::MAX` would never test an octet value of 255.
        for a in 0..=u8::MAX {
            println!("Iter: {}/255", a);
            for b in 0..=u8::MAX {
                for c in 0..=u8::MAX {
                    for d in 0..=u8::MAX {
                        let ip = IpAddr::V4(std::net::Ipv4Addr::new(a, b, c, d));
                        assert_eq!(ip.is_global(), is_global(ip), "mismatch for {ip}");
                    }
                }
            }
        }
    }

    #[test]
    #[ignore]
    fn test_ipv6_global() {
        use ring::rand::{SecureRandom, SystemRandom};
        let mut v = [0u8; 16];
        let rand = SystemRandom::new();
        for i in 0..1_000 {
            println!("Iter: {}/1_000", i);
            for _ in 0..10_000_000 {
                rand.fill(&mut v).expect("Error generating random values");
                // The bytes are random, so their order inside the address does not matter.
                let ip = IpAddr::V6(std::net::Ipv6Addr::from(v));
                assert_eq!(ip.is_global(), is_global(ip), "mismatch for {ip}");
            }
        }
    }
}
|
|
|
|
|
2023-08-04 16:50:38 +02:00
|
|
|
/// Reason why a domain is rejected by `check_domain_blacklist_reason`.
#[derive(Clone)]
enum DomainBlacklistReason {
    /// The domain matched the configured blacklist regex.
    Regex,
    /// The domain resolved to at least one non-global IP address.
    IP,
}
|
|
|
|
|
2022-02-22 20:48:00 +01:00
|
|
|
use cached::proc_macro::cached;
|
2023-02-07 11:54:06 +01:00
|
|
|
#[cached(key = "String", convert = r#"{ domain.to_string() }"#, size = 16, time = 60)]
|
2023-02-07 11:24:23 +01:00
|
|
|
async fn check_domain_blacklist_reason(domain: &str) -> Option<DomainBlacklistReason> {
|
2022-11-14 17:25:44 +01:00
|
|
|
// First check the blacklist regex if there is a match.
|
|
|
|
// This prevents the blocked domain(s) from being leaked via a DNS lookup.
|
2022-02-22 20:48:00 +01:00
|
|
|
if let Some(blacklist) = CONFIG.icon_blacklist_regex() {
|
|
|
|
// Use the pre-generate Regex stored in a Lazy HashMap if there's one, else generate it.
|
2022-05-20 23:49:05 +02:00
|
|
|
let is_match = if let Some(regex) = ICON_BLACKLIST_REGEX.get(&blacklist) {
|
|
|
|
regex.is_match(domain)
|
2022-02-22 20:48:00 +01:00
|
|
|
} else {
|
|
|
|
// Clear the current list if the previous key doesn't exists.
|
|
|
|
// To prevent growing of the HashMap after someone has changed it via the admin interface.
|
2022-05-20 23:49:05 +02:00
|
|
|
if ICON_BLACKLIST_REGEX.len() >= 1 {
|
|
|
|
ICON_BLACKLIST_REGEX.clear();
|
2019-10-05 14:48:15 +02:00
|
|
|
}
|
2022-02-22 20:48:00 +01:00
|
|
|
|
|
|
|
// Generate the regex to store in too the Lazy Static HashMap.
|
2022-05-20 23:49:05 +02:00
|
|
|
let blacklist_regex = Regex::new(&blacklist).unwrap();
|
|
|
|
let is_match = blacklist_regex.is_match(domain);
|
2022-07-10 16:39:38 +02:00
|
|
|
ICON_BLACKLIST_REGEX.insert(blacklist.clone(), blacklist_regex);
|
2022-02-22 20:48:00 +01:00
|
|
|
|
2022-05-20 23:49:05 +02:00
|
|
|
is_match
|
2022-02-22 20:48:00 +01:00
|
|
|
};
|
|
|
|
|
2022-05-20 23:49:05 +02:00
|
|
|
if is_match {
|
2022-02-22 20:48:00 +01:00
|
|
|
debug!("Blacklisted domain: {} matched ICON_BLACKLIST_REGEX", domain);
|
2023-02-07 11:24:23 +01:00
|
|
|
return Some(DomainBlacklistReason::Regex);
|
2019-10-05 14:48:15 +02:00
|
|
|
}
|
|
|
|
}
|
2022-11-14 17:25:44 +01:00
|
|
|
|
|
|
|
if CONFIG.icon_blacklist_non_global_ips() {
|
|
|
|
if let Ok(s) = lookup_host((domain, 0)).await {
|
|
|
|
for addr in s {
|
|
|
|
if !is_global(addr.ip()) {
|
|
|
|
debug!("IP {} for domain '{}' is not a global IP!", addr.ip(), domain);
|
2023-02-07 11:24:23 +01:00
|
|
|
return Some(DomainBlacklistReason::IP);
|
2022-11-14 17:25:44 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-02-07 11:24:23 +01:00
|
|
|
None
|
2019-10-05 14:48:15 +02:00
|
|
|
}
|
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
async fn get_icon(domain: &str) -> Option<(Vec<u8>, String)> {
|
2019-01-25 18:23:51 +01:00
|
|
|
let path = format!("{}/{}.png", CONFIG.icon_cache_folder(), domain);
|
2018-02-10 01:00:55 +01:00
|
|
|
|
2020-11-10 03:50:35 +01:00
|
|
|
// Check for expiration of negatively cached copy
|
2021-11-07 18:53:39 +01:00
|
|
|
if icon_is_negcached(&path).await {
|
2020-11-10 03:50:35 +01:00
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
if let Some(icon) = get_cached_icon(&path).await {
|
2022-07-10 16:39:38 +02:00
|
|
|
let icon_type = match get_icon_type(&icon) {
|
2021-04-03 22:51:44 +02:00
|
|
|
Some(x) => x,
|
|
|
|
_ => "x-icon",
|
|
|
|
};
|
|
|
|
return Some((icon, icon_type.to_string()));
|
2018-06-12 21:09:42 +02:00
|
|
|
}
|
2018-02-10 01:00:55 +01:00
|
|
|
|
2019-01-28 23:58:32 +01:00
|
|
|
if CONFIG.disable_icon_download() {
|
2020-11-10 02:45:58 +01:00
|
|
|
return None;
|
2019-01-28 23:58:32 +01:00
|
|
|
}
|
|
|
|
|
2020-11-10 02:45:58 +01:00
|
|
|
// Get the icon, or None in case of error
|
2021-11-07 18:53:39 +01:00
|
|
|
match download_icon(domain).await {
|
2021-04-03 22:51:44 +02:00
|
|
|
Ok((icon, icon_type)) => {
|
2021-11-07 18:53:39 +01:00
|
|
|
save_icon(&path, &icon).await;
|
|
|
|
Some((icon.to_vec(), icon_type.unwrap_or("x-icon").to_string()))
|
2018-12-19 00:57:45 +01:00
|
|
|
}
|
2018-07-01 15:27:42 +02:00
|
|
|
Err(e) => {
|
2021-12-24 18:24:25 +01:00
|
|
|
warn!("Unable to download icon: {:?}", e);
|
2019-11-06 20:34:52 +01:00
|
|
|
let miss_indicator = path + ".miss";
|
2021-11-07 18:53:39 +01:00
|
|
|
save_icon(&miss_indicator, &[]).await;
|
2020-11-10 02:45:58 +01:00
|
|
|
None
|
2018-07-01 15:27:42 +02:00
|
|
|
}
|
2018-06-12 21:09:42 +02:00
|
|
|
}
|
2018-02-10 01:00:55 +01:00
|
|
|
}
|
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
async fn get_cached_icon(path: &str) -> Option<Vec<u8>> {
|
2018-12-18 22:33:32 +01:00
|
|
|
// Check for expiration of successfully cached copy
|
2021-11-07 18:53:39 +01:00
|
|
|
if icon_is_expired(path).await {
|
2018-12-19 00:57:45 +01:00
|
|
|
return None;
|
2018-12-18 22:33:32 +01:00
|
|
|
}
|
|
|
|
|
2018-02-15 00:53:11 +01:00
|
|
|
// Try to read the cached icon, and return it if it exists
|
2021-11-07 18:53:39 +01:00
|
|
|
if let Ok(mut f) = File::open(path).await {
|
2018-06-11 15:44:37 +02:00
|
|
|
let mut buffer = Vec::new();
|
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
if f.read_to_end(&mut buffer).await.is_ok() {
|
2018-06-12 21:09:42 +02:00
|
|
|
return Some(buffer);
|
2018-02-10 01:00:55 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-12 21:09:42 +02:00
|
|
|
None
|
|
|
|
}
|
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
async fn file_is_expired(path: &str, ttl: u64) -> Result<bool, Error> {
|
|
|
|
let meta = symlink_metadata(path).await?;
|
2018-12-18 22:33:32 +01:00
|
|
|
let modified = meta.modified()?;
|
|
|
|
let age = SystemTime::now().duration_since(modified)?;
|
|
|
|
|
|
|
|
Ok(ttl > 0 && ttl <= age.as_secs())
|
|
|
|
}
|
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
async fn icon_is_negcached(path: &str) -> bool {
|
2018-12-18 22:33:32 +01:00
|
|
|
let miss_indicator = path.to_owned() + ".miss";
|
2021-11-07 18:53:39 +01:00
|
|
|
let expired = file_is_expired(&miss_indicator, CONFIG.icon_cache_negttl()).await;
|
2018-12-19 00:57:45 +01:00
|
|
|
|
2018-12-18 22:33:32 +01:00
|
|
|
match expired {
|
|
|
|
// No longer negatively cached, drop the marker
|
|
|
|
Ok(true) => {
|
2021-11-07 18:53:39 +01:00
|
|
|
if let Err(e) = remove_file(&miss_indicator).await {
|
2018-12-19 00:57:45 +01:00
|
|
|
error!("Could not remove negative cache indicator for icon {:?}: {:?}", path, e);
|
2018-12-18 22:33:32 +01:00
|
|
|
}
|
|
|
|
false
|
2018-12-19 00:57:45 +01:00
|
|
|
}
|
2018-12-18 22:33:32 +01:00
|
|
|
// The marker hasn't expired yet.
|
2018-12-19 00:57:45 +01:00
|
|
|
Ok(false) => true,
|
2018-12-18 22:33:32 +01:00
|
|
|
// The marker is missing or inaccessible in some way.
|
2018-12-19 00:57:45 +01:00
|
|
|
Err(_) => false,
|
2018-12-18 22:33:32 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
async fn icon_is_expired(path: &str) -> bool {
|
|
|
|
let expired = file_is_expired(path, CONFIG.icon_cache_ttl()).await;
|
2018-12-18 22:33:32 +01:00
|
|
|
expired.unwrap_or(true)
|
|
|
|
}
|
|
|
|
|
2019-03-18 22:12:39 +01:00
|
|
|
/// A candidate icon location together with its download priority.
struct Icon {
    /// Sort key of the candidate; the icon list is sorted ascending on it.
    priority: u8,
    /// Location of the icon.
    href: String,
}

impl Icon {
    /// Creates a candidate from its priority and location.
    const fn new(priority: u8, href: String) -> Self {
        Icon {
            priority,
            href,
        }
    }
}
|
|
|
|
|
2022-07-10 16:39:38 +02:00
|
|
|
fn get_favicons_node(
|
2022-02-22 20:48:00 +01:00
|
|
|
dom: InfallibleTokenizer<StringReader<'_>, FaviconEmitter>,
|
|
|
|
icons: &mut Vec<Icon>,
|
|
|
|
url: &url::Url,
|
|
|
|
) {
|
|
|
|
const TAG_LINK: &[u8] = b"link";
|
|
|
|
const TAG_BASE: &[u8] = b"base";
|
|
|
|
const TAG_HEAD: &[u8] = b"head";
|
|
|
|
const ATTR_HREF: &[u8] = b"href";
|
|
|
|
const ATTR_SIZES: &[u8] = b"sizes";
|
|
|
|
|
|
|
|
let mut base_url = url.clone();
|
2023-08-04 16:50:38 +02:00
|
|
|
let mut icon_tags: Vec<Tag> = Vec::new();
|
2022-02-22 20:48:00 +01:00
|
|
|
for token in dom {
|
2023-08-04 16:50:38 +02:00
|
|
|
let tag_name: &[u8] = &token.tag.name;
|
|
|
|
match tag_name {
|
|
|
|
TAG_LINK => {
|
|
|
|
icon_tags.push(token.tag);
|
|
|
|
}
|
|
|
|
TAG_BASE => {
|
|
|
|
base_url = if let Some(href) = token.tag.attributes.get(ATTR_HREF) {
|
|
|
|
let href = std::str::from_utf8(href).unwrap_or_default();
|
2022-02-22 20:48:00 +01:00
|
|
|
debug!("Found base href: {href}");
|
2023-08-04 16:50:38 +02:00
|
|
|
match base_url.join(href) {
|
2022-02-22 20:48:00 +01:00
|
|
|
Ok(inner_url) => inner_url,
|
2023-08-04 16:50:38 +02:00
|
|
|
_ => continue,
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
continue;
|
|
|
|
};
|
2021-02-07 22:28:02 +01:00
|
|
|
}
|
2023-08-04 16:50:38 +02:00
|
|
|
TAG_HEAD if token.closing => {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
_ => {
|
|
|
|
continue;
|
2021-02-07 22:28:02 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-02-22 20:48:00 +01:00
|
|
|
for icon_tag in icon_tags {
|
|
|
|
if let Some(icon_href) = icon_tag.attributes.get(ATTR_HREF) {
|
|
|
|
if let Ok(full_href) = base_url.join(std::str::from_utf8(icon_href).unwrap_or_default()) {
|
|
|
|
let sizes = if let Some(v) = icon_tag.attributes.get(ATTR_SIZES) {
|
|
|
|
std::str::from_utf8(v).unwrap_or_default()
|
|
|
|
} else {
|
|
|
|
""
|
|
|
|
};
|
2022-07-10 16:39:38 +02:00
|
|
|
let priority = get_icon_priority(full_href.as_str(), sizes);
|
2022-02-22 20:48:00 +01:00
|
|
|
icons.push(Icon::new(priority, full_href.to_string()));
|
|
|
|
}
|
|
|
|
};
|
2021-02-07 22:28:02 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-12-10 23:13:24 +01:00
|
|
|
struct IconUrlResult {
|
|
|
|
iconlist: Vec<Icon>,
|
|
|
|
referer: String,
|
|
|
|
}
|
|
|
|
|
2021-05-16 15:29:13 +02:00
|
|
|
/// Returns a IconUrlResult which holds a Vector IconList and a string which holds the referer.
|
|
|
|
/// There will always two items within the iconlist which holds http(s)://domain.tld/favicon.ico.
|
2019-02-04 16:59:52 +01:00
|
|
|
/// This does not mean that that location does exists, but it is the default location browser use.
|
2019-01-27 15:39:19 +01:00
|
|
|
///
|
|
|
|
/// # Argument
|
|
|
|
/// * `domain` - A string which holds the domain with extension.
|
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
/// ```
|
2022-02-22 20:48:00 +01:00
|
|
|
/// let icon_result = get_icon_url("github.com").await?;
|
|
|
|
/// let icon_result = get_icon_url("vaultwarden.discourse.group").await?;
|
2019-01-27 15:39:19 +01:00
|
|
|
/// ```
|
2021-11-07 18:53:39 +01:00
|
|
|
async fn get_icon_url(domain: &str) -> Result<IconUrlResult, Error> {
|
2019-01-27 15:39:19 +01:00
|
|
|
// Default URL with secure and insecure schemes
|
2022-02-22 20:48:00 +01:00
|
|
|
let ssldomain = format!("https://{domain}");
|
|
|
|
let httpdomain = format!("http://{domain}");
|
2019-01-27 15:39:19 +01:00
|
|
|
|
2020-12-08 17:33:15 +01:00
|
|
|
// First check the domain as given during the request for both HTTPS and HTTP.
|
2021-11-07 18:53:39 +01:00
|
|
|
let resp = match get_page(&ssldomain).or_else(|_| get_page(&httpdomain)).await {
|
2020-12-08 17:33:15 +01:00
|
|
|
Ok(c) => Ok(c),
|
|
|
|
Err(e) => {
|
|
|
|
let mut sub_resp = Err(e);
|
|
|
|
|
|
|
|
// When the domain is not an IP, and has more then one dot, remove all subdomains.
|
|
|
|
let is_ip = domain.parse::<IpAddr>();
|
|
|
|
if is_ip.is_err() && domain.matches('.').count() > 1 {
|
|
|
|
let mut domain_parts = domain.split('.');
|
|
|
|
let base_domain = format!(
|
|
|
|
"{base}.{tld}",
|
|
|
|
tld = domain_parts.next_back().unwrap(),
|
|
|
|
base = domain_parts.next_back().unwrap()
|
|
|
|
);
|
2022-07-10 16:39:38 +02:00
|
|
|
if is_valid_domain(&base_domain) {
|
2022-02-22 20:48:00 +01:00
|
|
|
let sslbase = format!("https://{base_domain}");
|
|
|
|
let httpbase = format!("http://{base_domain}");
|
|
|
|
debug!("[get_icon_url]: Trying without subdomains '{base_domain}'");
|
2020-12-08 17:33:15 +01:00
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
sub_resp = get_page(&sslbase).or_else(|_| get_page(&httpbase)).await;
|
2020-12-08 17:33:15 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// When the domain is not an IP, and has less then 2 dots, try to add www. infront of it.
|
|
|
|
} else if is_ip.is_err() && domain.matches('.').count() < 2 {
|
2022-02-22 20:48:00 +01:00
|
|
|
let www_domain = format!("www.{domain}");
|
2022-07-10 16:39:38 +02:00
|
|
|
if is_valid_domain(&www_domain) {
|
2022-02-22 20:48:00 +01:00
|
|
|
let sslwww = format!("https://{www_domain}");
|
|
|
|
let httpwww = format!("http://{www_domain}");
|
|
|
|
debug!("[get_icon_url]: Trying with www. prefix '{www_domain}'");
|
2020-12-08 17:33:15 +01:00
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
sub_resp = get_page(&sslwww).or_else(|_| get_page(&httpwww)).await;
|
2020-12-08 17:33:15 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
sub_resp
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2020-12-10 23:13:24 +01:00
|
|
|
// Create the iconlist
|
|
|
|
let mut iconlist: Vec<Icon> = Vec::new();
|
2022-11-04 12:56:02 +01:00
|
|
|
let mut referer = String::new();
|
2020-12-10 23:13:24 +01:00
|
|
|
|
2020-05-03 17:24:51 +02:00
|
|
|
if let Ok(content) = resp {
|
2023-10-05 19:08:26 +02:00
|
|
|
// Extract the URL from the response in case redirects occurred (like @ gitlab.com)
|
2019-01-29 18:08:23 +01:00
|
|
|
let url = content.url().clone();
|
2019-10-05 14:48:15 +02:00
|
|
|
|
2020-12-10 23:13:24 +01:00
|
|
|
// Set the referer to be used on the final request, some sites check this.
|
2023-10-05 19:08:26 +02:00
|
|
|
// Mostly used to prevent direct linking and other security reasons.
|
2022-02-22 20:48:00 +01:00
|
|
|
referer = url.to_string();
|
2020-12-10 23:13:24 +01:00
|
|
|
|
2022-02-22 20:48:00 +01:00
|
|
|
// Add the fallback favicon.ico and apple-touch-icon.png to the list with the domain the content responded from.
|
2021-05-08 17:46:31 +02:00
|
|
|
iconlist.push(Icon::new(35, String::from(url.join("/favicon.ico").unwrap())));
|
2022-02-22 20:48:00 +01:00
|
|
|
iconlist.push(Icon::new(40, String::from(url.join("/apple-touch-icon.png").unwrap())));
|
2019-01-27 15:39:19 +01:00
|
|
|
|
2021-05-16 15:29:13 +02:00
|
|
|
// 384KB should be more than enough for the HTML, though as we only really need the HTML header.
|
2022-02-22 20:48:00 +01:00
|
|
|
let limited_reader = stream_to_bytes_limit(content, 384 * 1024).await?.to_vec();
|
2021-02-07 22:28:02 +01:00
|
|
|
|
2022-02-22 20:48:00 +01:00
|
|
|
let dom = Tokenizer::new_with_emitter(limited_reader.to_reader(), FaviconEmitter::default()).infallible();
|
2022-07-10 16:39:38 +02:00
|
|
|
get_favicons_node(dom, &mut iconlist, &url);
|
2019-01-27 15:39:19 +01:00
|
|
|
} else {
|
|
|
|
// Add the default favicon.ico to the list with just the given domain
|
2022-02-22 20:48:00 +01:00
|
|
|
iconlist.push(Icon::new(35, format!("{ssldomain}/favicon.ico")));
|
|
|
|
iconlist.push(Icon::new(40, format!("{ssldomain}/apple-touch-icon.png")));
|
|
|
|
iconlist.push(Icon::new(35, format!("{httpdomain}/favicon.ico")));
|
|
|
|
iconlist.push(Icon::new(40, format!("{httpdomain}/apple-touch-icon.png")));
|
2019-01-27 15:39:19 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Sort the iconlist by priority
|
|
|
|
iconlist.sort_by_key(|x| x.priority);
|
|
|
|
|
|
|
|
// There always is an icon in the list, so no need to check if it exists, and just return the first one
|
2021-03-29 10:27:58 +02:00
|
|
|
Ok(IconUrlResult {
|
2020-12-10 23:13:24 +01:00
|
|
|
iconlist,
|
2021-03-29 10:27:58 +02:00
|
|
|
referer,
|
2020-12-10 23:13:24 +01:00
|
|
|
})
|
2019-01-27 15:39:19 +01:00
|
|
|
}
|
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
async fn get_page(url: &str) -> Result<Response, Error> {
|
|
|
|
get_page_with_referer(url, "").await
|
2019-01-31 15:49:58 +01:00
|
|
|
}
|
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
async fn get_page_with_referer(url: &str, referer: &str) -> Result<Response, Error> {
|
2023-02-07 11:24:23 +01:00
|
|
|
match check_domain_blacklist_reason(url::Url::parse(url).unwrap().host_str().unwrap_or_default()).await {
|
|
|
|
Some(DomainBlacklistReason::Regex) => warn!("Favicon '{}' is from a blacklisted domain!", url),
|
|
|
|
Some(DomainBlacklistReason::IP) => warn!("Favicon '{}' is hosted on a non-global IP!", url),
|
|
|
|
None => (),
|
2019-10-05 14:48:15 +02:00
|
|
|
}
|
2019-10-05 16:45:36 +02:00
|
|
|
|
2020-12-08 17:33:15 +01:00
|
|
|
let mut client = CLIENT.get(url);
|
2020-12-10 23:13:24 +01:00
|
|
|
if !referer.is_empty() {
|
|
|
|
client = client.header("Referer", referer)
|
2019-10-05 15:45:09 +02:00
|
|
|
}
|
2020-12-08 17:33:15 +01:00
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
match client.send().await {
|
2021-09-24 18:27:52 +02:00
|
|
|
Ok(c) => c.error_for_status().map_err(Into::into),
|
2022-12-29 14:11:52 +01:00
|
|
|
Err(e) => err_silent!(format!("{e}")),
|
2021-09-24 18:27:52 +02:00
|
|
|
}
|
2019-01-29 21:20:59 +01:00
|
|
|
}
|
|
|
|
|
2019-01-27 15:39:19 +01:00
|
|
|
/// Returns a Integer with the priority of the type of the icon which to prefer.
|
|
|
|
/// The lower the number the better.
|
|
|
|
///
|
|
|
|
/// # Arguments
|
|
|
|
/// * `href` - A string which holds the href value or relative path.
|
|
|
|
/// * `sizes` - The size of the icon if available as a <width>x<height> value like 32x32.
|
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
/// ```
|
2022-07-10 16:39:38 +02:00
|
|
|
/// priority1 = get_icon_priority("http://example.com/path/to/a/favicon.png", "32x32");
|
|
|
|
/// priority2 = get_icon_priority("https://example.com/path/to/a/favicon.ico", "");
|
2019-01-27 15:39:19 +01:00
|
|
|
/// ```
|
2022-07-10 16:39:38 +02:00
|
|
|
fn get_icon_priority(href: &str, sizes: &str) -> u8 {
|
2019-01-27 15:39:19 +01:00
|
|
|
// Check if there is a dimension set
|
2022-07-10 16:39:38 +02:00
|
|
|
let (width, height) = parse_sizes(sizes);
|
2019-01-27 15:39:19 +01:00
|
|
|
|
2019-02-04 12:55:39 +01:00
|
|
|
// Check if there is a size given
|
|
|
|
if width != 0 && height != 0 {
|
2019-01-27 15:39:19 +01:00
|
|
|
// Only allow square dimensions
|
|
|
|
if width == height {
|
|
|
|
// Change priority by given size
|
|
|
|
if width == 32 {
|
|
|
|
1
|
|
|
|
} else if width == 64 {
|
|
|
|
2
|
2021-05-16 15:29:13 +02:00
|
|
|
} else if (24..=192).contains(&width) {
|
2019-01-27 15:39:19 +01:00
|
|
|
3
|
|
|
|
} else if width == 16 {
|
|
|
|
4
|
|
|
|
} else {
|
2019-02-04 12:55:39 +01:00
|
|
|
5
|
2019-01-27 15:39:19 +01:00
|
|
|
}
|
2019-02-04 12:55:39 +01:00
|
|
|
// There are dimensions available, but the image is not a square
|
2019-01-27 15:39:19 +01:00
|
|
|
} else {
|
|
|
|
200
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Change priority by file extension
|
|
|
|
if href.ends_with(".png") {
|
|
|
|
10
|
|
|
|
} else if href.ends_with(".jpg") || href.ends_with(".jpeg") {
|
|
|
|
20
|
|
|
|
} else {
|
|
|
|
30
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-10-05 19:08:26 +02:00
|
|
|
/// Returns a Tuple with the width and height as a separate value extracted from the sizes attribute
|
2019-02-04 12:55:39 +01:00
|
|
|
/// It will return 0 for both values if no match has been found.
|
|
|
|
///
|
|
|
|
/// # Arguments
|
|
|
|
/// * `sizes` - The size of the icon if available as a <width>x<height> value like 32x32.
|
2019-02-04 17:27:40 +01:00
|
|
|
///
|
2019-02-04 12:55:39 +01:00
|
|
|
/// # Example
|
|
|
|
/// ```
|
2022-07-10 16:39:38 +02:00
|
|
|
/// let (width, height) = parse_sizes("64x64"); // (64, 64)
|
|
|
|
/// let (width, height) = parse_sizes("x128x128"); // (128, 128)
|
|
|
|
/// let (width, height) = parse_sizes("32"); // (0, 0)
|
2019-02-04 12:55:39 +01:00
|
|
|
/// ```
|
2022-07-10 16:39:38 +02:00
|
|
|
fn parse_sizes(sizes: &str) -> (u16, u16) {
|
2019-02-04 12:55:39 +01:00
|
|
|
let mut width: u16 = 0;
|
|
|
|
let mut height: u16 = 0;
|
|
|
|
|
2022-02-22 20:48:00 +01:00
|
|
|
if !sizes.is_empty() {
|
2020-07-14 21:58:27 +02:00
|
|
|
match ICON_SIZE_REGEX.captures(sizes.trim()) {
|
2019-03-03 16:11:55 +01:00
|
|
|
None => {}
|
2019-02-04 12:55:39 +01:00
|
|
|
Some(dimensions) => {
|
|
|
|
if dimensions.len() >= 3 {
|
|
|
|
width = dimensions[1].parse::<u16>().unwrap_or_default();
|
|
|
|
height = dimensions[2].parse::<u16>().unwrap_or_default();
|
|
|
|
}
|
2019-03-03 16:11:55 +01:00
|
|
|
}
|
2019-02-04 12:55:39 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
(width, height)
|
|
|
|
}
|
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
async fn download_icon(domain: &str) -> Result<(Bytes, Option<&str>), Error> {
|
2023-02-07 11:24:23 +01:00
|
|
|
match check_domain_blacklist_reason(domain).await {
|
|
|
|
Some(DomainBlacklistReason::Regex) => err_silent!("Domain is blacklisted", domain),
|
|
|
|
Some(DomainBlacklistReason::IP) => err_silent!("Host resolves to a non-global IP", domain),
|
|
|
|
None => (),
|
2019-10-10 23:21:22 +02:00
|
|
|
}
|
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
let icon_result = get_icon_url(domain).await?;
|
2018-06-12 21:09:42 +02:00
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
let mut buffer = Bytes::new();
|
2021-04-03 22:51:44 +02:00
|
|
|
let mut icon_type: Option<&str> = None;
|
2019-02-04 16:59:52 +01:00
|
|
|
|
2019-11-22 13:16:12 +01:00
|
|
|
use data_url::DataUrl;
|
|
|
|
|
2020-12-10 23:13:24 +01:00
|
|
|
for icon in icon_result.iconlist.iter().take(5) {
|
2019-11-22 13:16:12 +01:00
|
|
|
if icon.href.starts_with("data:image") {
|
2023-08-28 16:48:42 +02:00
|
|
|
let Ok(datauri) = DataUrl::process(&icon.href) else {
|
|
|
|
continue;
|
|
|
|
};
|
2019-11-22 13:16:12 +01:00
|
|
|
// Check if we are able to decode the data uri
|
2021-11-07 18:53:39 +01:00
|
|
|
let mut body = BytesMut::new();
|
|
|
|
match datauri.decode::<_, ()>(|bytes| {
|
|
|
|
body.extend_from_slice(bytes);
|
|
|
|
Ok(())
|
|
|
|
}) {
|
|
|
|
Ok(_) => {
|
2019-11-22 13:16:12 +01:00
|
|
|
// Also check if the size is atleast 67 bytes, which seems to be the smallest png i could create
|
|
|
|
if body.len() >= 67 {
|
2021-04-03 22:51:44 +02:00
|
|
|
// Check if the icon type is allowed, else try an icon from the list.
|
2022-07-10 16:39:38 +02:00
|
|
|
icon_type = get_icon_type(&body);
|
2021-04-03 22:51:44 +02:00
|
|
|
if icon_type.is_none() {
|
|
|
|
debug!("Icon from {} data:image uri, is not a valid image type", domain);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
info!("Extracted icon from data:image uri for {}", domain);
|
2021-11-07 18:53:39 +01:00
|
|
|
buffer = body.freeze();
|
2019-11-22 13:16:12 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2021-09-24 18:27:52 +02:00
|
|
|
_ => debug!("Extracted icon from data:image uri is invalid"),
|
2019-11-22 13:16:12 +01:00
|
|
|
};
|
|
|
|
} else {
|
2021-11-07 18:53:39 +01:00
|
|
|
match get_page_with_referer(&icon.href, &icon_result.referer).await {
|
|
|
|
Ok(res) => {
|
2022-02-22 20:48:00 +01:00
|
|
|
buffer = stream_to_bytes_limit(res, 5120 * 1024).await?; // 5120KB/5MB for each icon max (Same as icons.bitwarden.net)
|
|
|
|
|
2021-11-16 17:07:55 +01:00
|
|
|
// Check if the icon type is allowed, else try an icon from the list.
|
2022-07-10 16:39:38 +02:00
|
|
|
icon_type = get_icon_type(&buffer);
|
2021-04-03 22:51:44 +02:00
|
|
|
if icon_type.is_none() {
|
|
|
|
buffer.clear();
|
|
|
|
debug!("Icon from {}, is not a valid image type", icon.href);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
info!("Downloaded icon from {}", icon.href);
|
2019-11-22 13:16:12 +01:00
|
|
|
break;
|
2021-03-29 10:27:58 +02:00
|
|
|
}
|
2021-09-24 18:27:52 +02:00
|
|
|
Err(e) => debug!("{:?}", e),
|
2019-11-22 13:16:12 +01:00
|
|
|
};
|
|
|
|
}
|
2019-02-04 16:59:52 +01:00
|
|
|
}
|
2018-06-12 21:09:42 +02:00
|
|
|
|
2019-01-29 21:20:59 +01:00
|
|
|
if buffer.is_empty() {
|
2021-09-24 18:27:52 +02:00
|
|
|
err_silent!("Empty response or unable find a valid icon", domain);
|
2019-01-29 21:20:59 +01:00
|
|
|
}
|
|
|
|
|
2021-04-03 22:51:44 +02:00
|
|
|
Ok((buffer, icon_type))
|
2018-06-12 21:09:42 +02:00
|
|
|
}
|
|
|
|
|
2021-11-07 18:53:39 +01:00
|
|
|
async fn save_icon(path: &str, icon: &[u8]) {
|
|
|
|
match File::create(path).await {
|
2019-11-06 20:21:47 +01:00
|
|
|
Ok(mut f) => {
|
2021-11-07 18:53:39 +01:00
|
|
|
f.write_all(icon).await.expect("Error writing icon file");
|
2019-11-06 20:21:47 +01:00
|
|
|
}
|
|
|
|
Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {
|
2021-11-07 18:53:39 +01:00
|
|
|
create_dir_all(&CONFIG.icon_cache_folder()).await.expect("Error creating icon cache folder");
|
2019-11-06 20:21:47 +01:00
|
|
|
}
|
|
|
|
Err(e) => {
|
2021-12-24 18:24:25 +01:00
|
|
|
warn!("Unable to save icon: {:?}", e);
|
2019-11-06 20:21:47 +01:00
|
|
|
}
|
|
|
|
}
|
2018-02-10 01:00:55 +01:00
|
|
|
}
|
2021-04-03 22:51:44 +02:00
|
|
|
|
2022-07-10 16:39:38 +02:00
|
|
|
/// Detects the image type of `bytes` from its leading magic bytes.
///
/// Returns the type name (`"png"`, `"x-icon"`, `"webp"`, `"jpeg"`, `"gif"` or `"bmp"`)
/// or `None` when the data does not start with one of the known signatures.
///
/// A WebP file is a RIFF container whose form type at offset 8 is `WEBP`. RIFF is also
/// used by other formats such as WAV and AVI, so the `RIFF` prefix alone is not accepted.
fn get_icon_type(bytes: &[u8]) -> Option<&'static str> {
    match bytes {
        [0x89, b'P', b'N', b'G', ..] => Some("png"),
        [0, 0, 1, 0, ..] => Some("x-icon"),
        // "RIFF" + 4 byte chunk size + "WEBP"
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => Some("webp"),
        [0xFF, 0xD8, 0xFF, ..] => Some("jpeg"),
        [b'G', b'I', b'F', b'8', ..] => Some("gif"),
        [b'B', b'M', ..] => Some("bmp"),
        _ => None,
    }
}
|
2021-05-16 15:29:13 +02:00
|
|
|
|
2022-02-22 20:48:00 +01:00
|
|
|
/// Minimize the amount of bytes to be parsed from a reqwest result.
|
|
|
|
/// This prevents very long parsing and memory usage.
|
|
|
|
async fn stream_to_bytes_limit(res: Response, max_size: usize) -> Result<Bytes, reqwest::Error> {
|
|
|
|
let mut stream = res.bytes_stream().take(max_size);
|
|
|
|
let mut buf = BytesMut::new();
|
|
|
|
let mut size = 0;
|
|
|
|
while let Some(chunk) = stream.next().await {
|
|
|
|
let chunk = &chunk?;
|
|
|
|
size += chunk.len();
|
|
|
|
buf.extend(chunk);
|
|
|
|
if size >= max_size {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Ok(buf.freeze())
|
|
|
|
}
|
|
|
|
|
2021-05-16 15:29:13 +02:00
|
|
|
/// This is an implementation of the default Cookie Jar from Reqwest and reqwest_cookie_store build by pfernie.
|
|
|
|
/// The default cookie jar used by Reqwest keeps all the cookies based upon the Max-Age or Expires which could be a long time.
|
|
|
|
/// That could be used for tracking, to prevent this we force the lifespan of the cookies to always be max two minutes.
|
|
|
|
/// A Cookie Jar is needed because some sites force a redirect with cookies to verify if a request uses cookies or not.
|
|
|
|
use cookie_store::CookieStore;
|
|
|
|
#[derive(Default)]
|
2022-02-22 20:48:00 +01:00
|
|
|
pub struct Jar(std::sync::RwLock<CookieStore>);
|
2021-05-16 15:29:13 +02:00
|
|
|
|
|
|
|
impl reqwest::cookie::CookieStore for Jar {
|
|
|
|
fn set_cookies(&self, cookie_headers: &mut dyn Iterator<Item = &header::HeaderValue>, url: &url::Url) {
|
|
|
|
use cookie::{Cookie as RawCookie, ParseError as RawCookieParseError};
|
|
|
|
use time::Duration;
|
|
|
|
|
|
|
|
let mut cookie_store = self.0.write().unwrap();
|
|
|
|
let cookies = cookie_headers.filter_map(|val| {
|
|
|
|
std::str::from_utf8(val.as_bytes())
|
|
|
|
.map_err(RawCookieParseError::from)
|
|
|
|
.and_then(RawCookie::parse)
|
|
|
|
.map(|mut c| {
|
|
|
|
c.set_expires(None);
|
|
|
|
c.set_max_age(Some(Duration::minutes(2)));
|
|
|
|
c.into_owned()
|
|
|
|
})
|
|
|
|
.ok()
|
|
|
|
});
|
|
|
|
cookie_store.store_response_cookies(cookies, url);
|
|
|
|
}
|
|
|
|
|
|
|
|
fn cookies(&self, url: &url::Url) -> Option<header::HeaderValue> {
|
|
|
|
let cookie_store = self.0.read().unwrap();
|
|
|
|
let s = cookie_store
|
|
|
|
.get_request_values(url)
|
2022-12-29 14:11:52 +01:00
|
|
|
.map(|(name, value)| format!("{name}={value}"))
|
2021-05-16 15:29:13 +02:00
|
|
|
.collect::<Vec<_>>()
|
|
|
|
.join("; ");
|
|
|
|
|
|
|
|
if s.is_empty() {
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
|
|
|
header::HeaderValue::from_maybe_shared(Bytes::from(s)).ok()
|
|
|
|
}
|
|
|
|
}
|
2021-11-07 18:53:39 +01:00
|
|
|
|
2022-02-22 20:48:00 +01:00
|
|
|
/// Custom FaviconEmitter for the html5gum parser.
|
2023-08-04 16:50:38 +02:00
|
|
|
/// The FaviconEmitter is using an optimized version of the DefaultEmitter.
|
2022-02-22 20:48:00 +01:00
|
|
|
/// This prevents emitting tags like comments, doctype and also strings between the tags.
|
2023-08-04 16:50:38 +02:00
|
|
|
/// But it will also only emit the tags we need and only if they have the correct attributes.
|
2022-02-22 20:48:00 +01:00
|
|
|
/// Therefore parsing the HTML content is faster.
|
2023-08-04 16:50:38 +02:00
|
|
|
use std::collections::BTreeMap;
|
|
|
|
|
|
|
|
/// An HTML tag as collected by `FaviconEmitter`: its name plus the attributes kept for it.
#[derive(Default)]
pub struct Tag {
    /// The tag's name, such as `"link"` or `"base"`.
    pub name: HtmlString,

    /// A mapping for any HTML attributes this start tag may have.
    ///
    /// Duplicate attributes are ignored after the first one as per WHATWG spec.
    pub attributes: BTreeMap<HtmlString, HtmlString>,
}
|
|
|
|
|
2023-08-04 16:50:38 +02:00
|
|
|
/// The token type produced by `FaviconEmitter`: a tag and whether it is an end tag.
struct FaviconToken {
    /// Name and collected attributes of the tag.
    tag: Tag,
    /// `true` when the token was started by `init_end_tag`, `false` for `init_start_tag`.
    closing: bool,
}
|
|
|
|
|
|
|
|
#[derive(Default)]
|
2022-02-22 20:48:00 +01:00
|
|
|
struct FaviconEmitter {
|
|
|
|
current_token: Option<FaviconToken>,
|
2022-06-21 18:47:01 +02:00
|
|
|
last_start_tag: HtmlString,
|
|
|
|
current_attribute: Option<(HtmlString, HtmlString)>,
|
2023-08-04 16:50:38 +02:00
|
|
|
emit_token: bool,
|
2022-02-22 20:48:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
impl FaviconEmitter {
|
2023-08-04 16:50:38 +02:00
|
|
|
fn flush_current_attribute(&mut self, emit_current_tag: bool) {
|
|
|
|
const ATTR_HREF: &[u8] = b"href";
|
|
|
|
const ATTR_REL: &[u8] = b"rel";
|
|
|
|
const TAG_LINK: &[u8] = b"link";
|
|
|
|
const TAG_BASE: &[u8] = b"base";
|
|
|
|
const TAG_HEAD: &[u8] = b"head";
|
|
|
|
|
|
|
|
if let Some(ref mut token) = self.current_token {
|
|
|
|
let tag_name: &[u8] = &token.tag.name;
|
|
|
|
|
|
|
|
if self.current_attribute.is_some() && (tag_name == TAG_BASE || tag_name == TAG_LINK) {
|
|
|
|
let (k, v) = self.current_attribute.take().unwrap();
|
|
|
|
token.tag.attributes.entry(k).and_modify(|_| {}).or_insert(v);
|
|
|
|
}
|
2022-02-22 20:48:00 +01:00
|
|
|
|
2023-08-04 16:50:38 +02:00
|
|
|
let tag_attr = &token.tag.attributes;
|
|
|
|
match tag_name {
|
|
|
|
TAG_HEAD if token.closing => self.emit_token = true,
|
|
|
|
TAG_BASE if tag_attr.contains_key(ATTR_HREF) => self.emit_token = true,
|
|
|
|
TAG_LINK if emit_current_tag && tag_attr.contains_key(ATTR_REL) && tag_attr.contains_key(ATTR_HREF) => {
|
|
|
|
let rel_value =
|
|
|
|
std::str::from_utf8(token.tag.attributes.get(ATTR_REL).unwrap()).unwrap_or_default();
|
|
|
|
if rel_value.contains("icon") && !rel_value.contains("mask-icon") {
|
|
|
|
self.emit_token = true
|
|
|
|
}
|
2022-02-22 20:48:00 +01:00
|
|
|
}
|
2023-08-04 16:50:38 +02:00
|
|
|
_ => (),
|
2022-02-22 20:48:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Emitter for FaviconEmitter {
|
|
|
|
type Token = FaviconToken;
|
|
|
|
|
|
|
|
fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) {
|
|
|
|
self.last_start_tag.clear();
|
|
|
|
self.last_start_tag.extend(last_start_tag.unwrap_or_default());
|
|
|
|
}
|
|
|
|
|
|
|
|
fn pop_token(&mut self) -> Option<Self::Token> {
|
2023-08-04 16:50:38 +02:00
|
|
|
if self.emit_token {
|
|
|
|
self.emit_token = false;
|
|
|
|
return self.current_token.take();
|
|
|
|
}
|
|
|
|
None
|
2022-02-22 20:48:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
fn init_start_tag(&mut self) {
|
2023-08-04 16:50:38 +02:00
|
|
|
self.current_token = Some(FaviconToken {
|
|
|
|
tag: Tag::default(),
|
|
|
|
closing: false,
|
|
|
|
});
|
2022-02-22 20:48:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
fn init_end_tag(&mut self) {
|
2023-08-04 16:50:38 +02:00
|
|
|
self.current_token = Some(FaviconToken {
|
|
|
|
tag: Tag::default(),
|
|
|
|
closing: true,
|
|
|
|
});
|
2022-02-22 20:48:00 +01:00
|
|
|
}
|
|
|
|
|
2022-06-21 18:47:01 +02:00
|
|
|
fn emit_current_tag(&mut self) -> Option<html5gum::State> {
|
2023-08-04 16:50:38 +02:00
|
|
|
self.flush_current_attribute(true);
|
|
|
|
self.last_start_tag.clear();
|
|
|
|
if self.current_token.is_some() && !self.current_token.as_ref().unwrap().closing {
|
|
|
|
self.last_start_tag.extend(&*self.current_token.as_ref().unwrap().tag.name);
|
2022-06-21 18:47:01 +02:00
|
|
|
}
|
2023-08-04 16:50:38 +02:00
|
|
|
html5gum::naive_next_state(&self.last_start_tag)
|
2022-02-22 20:48:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
fn push_tag_name(&mut self, s: &[u8]) {
|
2023-08-04 16:50:38 +02:00
|
|
|
if let Some(ref mut token) = self.current_token {
|
|
|
|
token.tag.name.extend(s);
|
2022-02-22 20:48:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fn init_attribute(&mut self) {
|
2023-08-04 16:50:38 +02:00
|
|
|
self.flush_current_attribute(false);
|
|
|
|
self.current_attribute = match &self.current_token {
|
|
|
|
Some(token) => {
|
|
|
|
let tag_name: &[u8] = &token.tag.name;
|
|
|
|
match tag_name {
|
|
|
|
b"link" | b"head" | b"base" => Some(Default::default()),
|
|
|
|
_ => None,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
_ => None,
|
|
|
|
};
|
2022-02-22 20:48:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
fn push_attribute_name(&mut self, s: &[u8]) {
|
2023-08-04 16:50:38 +02:00
|
|
|
if let Some(attr) = &mut self.current_attribute {
|
|
|
|
attr.0.extend(s)
|
|
|
|
}
|
2022-02-22 20:48:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
fn push_attribute_value(&mut self, s: &[u8]) {
|
2023-08-04 16:50:38 +02:00
|
|
|
if let Some(attr) = &mut self.current_attribute {
|
|
|
|
attr.1.extend(s)
|
|
|
|
}
|
2022-02-22 20:48:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
fn current_is_appropriate_end_tag_token(&mut self) -> bool {
|
2023-08-04 16:50:38 +02:00
|
|
|
match &self.current_token {
|
|
|
|
Some(token) if token.closing => !self.last_start_tag.is_empty() && self.last_start_tag == token.tag.name,
|
2022-02-22 20:48:00 +01:00
|
|
|
_ => false,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// We do not want and need these parts of the HTML document
|
|
|
|
// These will be skipped and ignored during the tokenization and iteration.
|
|
|
|
fn emit_current_comment(&mut self) {}
|
|
|
|
fn emit_current_doctype(&mut self) {}
|
|
|
|
fn emit_eof(&mut self) {}
|
|
|
|
fn emit_error(&mut self, _: html5gum::Error) {}
|
|
|
|
fn emit_string(&mut self, _: &[u8]) {}
|
|
|
|
fn init_comment(&mut self) {}
|
|
|
|
fn init_doctype(&mut self) {}
|
|
|
|
fn push_comment(&mut self, _: &[u8]) {}
|
|
|
|
fn push_doctype_name(&mut self, _: &[u8]) {}
|
|
|
|
fn push_doctype_public_identifier(&mut self, _: &[u8]) {}
|
|
|
|
fn push_doctype_system_identifier(&mut self, _: &[u8]) {}
|
|
|
|
fn set_doctype_public_identifier(&mut self, _: &[u8]) {}
|
|
|
|
fn set_doctype_system_identifier(&mut self, _: &[u8]) {}
|
|
|
|
fn set_force_quirks(&mut self) {}
|
|
|
|
fn set_self_closing(&mut self) {}
|
2021-11-07 18:53:39 +01:00
|
|
|
}
|