2020-07-14 18:00:09 +02:00
use std ::{
2020-12-08 17:33:15 +01:00
collections ::HashMap ,
2020-07-14 18:00:09 +02:00
fs ::{ create_dir_all , remove_file , symlink_metadata , File } ,
io ::prelude ::* ,
2020-07-14 23:44:01 +02:00
net ::{ IpAddr , ToSocketAddrs } ,
2020-12-08 17:33:15 +01:00
sync ::RwLock ,
2020-07-14 18:00:09 +02:00
time ::{ Duration , SystemTime } ,
} ;
2019-01-31 15:49:58 +01:00
2020-07-14 18:00:09 +02:00
use once_cell ::sync ::Lazy ;
2019-01-27 15:39:19 +01:00
use regex ::Regex ;
2020-12-10 23:13:24 +01:00
use reqwest ::{ blocking ::Client , blocking ::Response , header , Url } ;
2020-07-14 18:00:09 +02:00
use rocket ::{ http ::ContentType , http ::Cookie , response ::Content , Route } ;
2019-01-27 15:39:19 +01:00
2020-07-14 18:00:09 +02:00
use crate ::{ error ::Error , util ::Cached , CONFIG } ;
2019-01-27 15:39:19 +01:00
2018-02-10 01:00:55 +01:00
/// Returns the Rocket routes exposed by this module (currently only the `icon` handler).
pub fn routes ( ) -> Vec < Route > {
routes! [ icon ]
}
2019-03-18 22:12:39 +01:00
/// Non-alphanumeric characters that may appear in a requested domain name.
/// Whitespace is deliberately not part of this set.
const ALLOWED_CHARS: &str = "_-.";
2020-03-09 22:04:03 +01:00
static CLIENT : Lazy < Client > = Lazy ::new ( | | {
2020-12-10 23:13:24 +01:00
// Generate the default headers
let mut default_headers = header ::HeaderMap ::new ( ) ;
default_headers . insert ( header ::USER_AGENT , header ::HeaderValue ::from_static ( " Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15 " ) ) ;
default_headers . insert ( header ::ACCEPT_LANGUAGE , header ::HeaderValue ::from_static ( " en-US,en;q=0.8 " ) ) ;
default_headers . insert ( header ::CACHE_CONTROL , header ::HeaderValue ::from_static ( " no-cache " ) ) ;
default_headers . insert ( header ::PRAGMA , header ::HeaderValue ::from_static ( " no-cache " ) ) ;
default_headers . insert ( header ::ACCEPT , header ::HeaderValue ::from_static ( " text/html,application/xhtml+xml,application/xml; q=0.9,image/webp,image/apng,*/*;q=0.8 " ) ) ;
2019-01-29 21:20:59 +01:00
// Reuse the client between requests
2020-03-09 22:04:03 +01:00
Client ::builder ( )
2019-02-12 21:56:28 +01:00
. timeout ( Duration ::from_secs ( CONFIG . icon_download_timeout ( ) ) )
2020-12-10 23:13:24 +01:00
. default_headers ( default_headers )
2019-01-29 21:20:59 +01:00
. build ( )
2020-03-09 22:04:03 +01:00
. unwrap ( )
} ) ;
2019-01-29 21:20:59 +01:00
2020-12-08 17:33:15 +01:00
// Build Regex only once since this takes a lot of time.
static ICON_REL_REGEX : Lazy < Regex > = Lazy ::new ( | | Regex ::new ( r "(?i)icon$|apple.*icon" ) . unwrap ( ) ) ;
2020-07-14 21:58:27 +02:00
static ICON_SIZE_REGEX : Lazy < Regex > = Lazy ::new ( | | Regex ::new ( r "(?x)(\d+)\D*(\d+)" ) . unwrap ( ) ) ;
2020-12-08 17:33:15 +01:00
// Special HashMap which holds the user defined Regex to speedup matching the regex.
static ICON_BLACKLIST_REGEX : Lazy < RwLock < HashMap < String , Regex > > > = Lazy ::new ( | | RwLock ::new ( HashMap ::new ( ) ) ) ;
#[ get( " /<domain>/icon.png " ) ]
fn icon ( domain : String ) -> Option < Cached < Content < Vec < u8 > > > > {
if ! is_valid_domain ( & domain ) {
warn! ( " Invalid domain: {} " , domain ) ;
return None ;
}
2021-03-27 14:30:40 +01:00
get_icon ( & domain ) . map ( | ( icon , cached ) | {
let cache_ttl = if cached { CONFIG . icon_cache_ttl ( ) } else { CONFIG . icon_cache_negttl ( ) } ;
Cached ::ttl ( Content ( ContentType ::new ( " image " , " x-icon " ) , icon ) , cache_ttl )
} )
2020-12-08 17:33:15 +01:00
}
/// Returns if the domain provided is valid or not.
///
/// This does some manual checks and makes use of Url to do some basic checking.
/// domains can't be larger then 63 characters (not counting multiple subdomains) according to the RFC's, but we limit the total size to 255.
2019-03-18 22:12:39 +01:00
fn is_valid_domain ( domain : & str ) -> bool {
2020-12-08 17:33:15 +01:00
// If parsing the domain fails using Url, it will not work with reqwest.
if let Err ( parse_error ) = Url ::parse ( format! ( " https:// {} " , domain ) . as_str ( ) ) {
debug! ( " Domain parse error: '{}' - {:?} " , domain , parse_error ) ;
return false ;
} else if domain . is_empty ( )
| | domain . contains ( " .. " )
| | domain . starts_with ( '.' )
| | domain . starts_with ( '-' )
| | domain . ends_with ( '-' )
{
debug! ( " Domain validation error: '{}' is either empty, contains '..', starts with an '.', starts or ends with a '-' " , domain ) ;
return false ;
} else if domain . len ( ) > 255 {
debug! ( " Domain validation error: '{}' exceeds 255 characters " , domain ) ;
2019-03-18 22:12:39 +01:00
return false ;
}
for c in domain . chars ( ) {
if ! c . is_alphanumeric ( ) & & ! ALLOWED_CHARS . contains ( c ) {
2020-12-08 17:33:15 +01:00
debug! ( " Domain validation error: '{}' contains an invalid character '{}' " , domain , c ) ;
2019-03-18 22:12:39 +01:00
return false ;
}
}
true
}
2020-07-14 23:44:01 +02:00
/// Returns whether `ip` is a globally routable address.
///
/// TODO: This is extracted from IpAddr::is_global, which is unstable:
/// https://doc.rust-lang.org/nightly/std/net/enum.IpAddr.html#method.is_global
/// Remove once https://github.com/rust-lang/rust/issues/27709 is merged
///
/// In addition to the extracted checks, IPv4-mapped IPv6 addresses (`::ffff:a.b.c.d`) are always
/// treated as non-global, so that e.g. `::ffff:127.0.0.1` cannot be used to reach a local address.
#[allow(clippy::nonminimal_bool)]
#[cfg(not(feature = "unstable"))]
fn is_global(ip: IpAddr) -> bool {
    match ip {
        IpAddr::V4(ip) => {
            let [a, b, c, d] = ip.octets();

            // 192.0.0.9 and 192.0.0.10 are the only two globally routable addresses
            // in the 192.0.0.0/24 range.
            if [a, b, c] == [192, 0, 0] && (d == 9 || d == 10) {
                return true;
            }

            !ip.is_private()
                && !ip.is_loopback()
                && !ip.is_link_local()
                && !ip.is_broadcast()
                && !ip.is_documentation()
                // Shared address space: 100.64.0.0/10
                && !(a == 100 && (b & 0b1100_0000 == 0b0100_0000))
                // IETF protocol assignments: 192.0.0.0/24
                && !(a == 192 && b == 0 && c == 0)
                // Reserved: 240.0.0.0/4 (the broadcast address is handled above)
                && !(a & 240 == 240 && !ip.is_broadcast())
                // Benchmarking: 198.18.0.0/15
                && !(a == 198 && (b & 0xfe) == 18)
                // Make sure the address is not in 0.0.0.0/8
                && a != 0
        }
        IpAddr::V6(ip) => {
            let segments = ip.segments();

            // IPv4-mapped addresses (::ffff:0:0/96) are never considered global.
            if matches!(segments, [0, 0, 0, 0, 0, 0xffff, _, _]) {
                return false;
            }

            // Multicast addresses are global only when their scope is "global" (0xe).
            if ip.is_multicast() {
                return segments[0] & 0x000f == 14;
            }

            !ip.is_loopback()
                // Unicast link-local: fe80::/10
                && (segments[0] & 0xffc0) != 0xfe80
                // Unique local: fc00::/7
                && (segments[0] & 0xfe00) != 0xfc00
                && !ip.is_unspecified()
                // Documentation: 2001:db8::/32
                && !(segments[0] == 0x2001 && segments[1] == 0xdb8)
        }
    }
}

/// Returns whether `ip` is a globally routable address, using the (unstable) standard library
/// implementation.
#[cfg(feature = "unstable")]
fn is_global(ip: IpAddr) -> bool {
    ip.is_global()
}
/// These are some tests to check that the implementations match.
/// All IPv4 addresses can be checked in 5 mins or so, and they were correct as of nightly 2020-07-11.
/// The IPv6 space can't be checked in a reasonable time, so about ten billion random addresses are
/// checked instead; so far they were correct.
/// Note that the is_global implementation is subject to change as new IP RFCs are created.
///
/// NOTE(review): these tests are only compiled with the `unstable` feature, and with that feature
/// enabled `is_global` is the thin wrapper around `IpAddr::is_global` — so as written both sides
/// of each assertion appear to go through the standard library. Confirm that the hand-written
/// implementation is really what gets exercised.
///
/// To run while showing progress output:
/// cargo test --features sqlite,unstable -- --nocapture --ignored
#[cfg(test)]
#[cfg(feature = "unstable")]
mod tests {
    use super::*;

    #[test]
    #[ignore]
    fn test_ipv4_global() {
        // Inclusive ranges: octet 255 (e.g. the broadcast address) must be covered as well.
        for a in 0..=u8::MAX {
            println!("Iter: {}/255", a);
            for b in 0..=u8::MAX {
                for c in 0..=u8::MAX {
                    for d in 0..=u8::MAX {
                        let ip = IpAddr::V4(std::net::Ipv4Addr::new(a, b, c, d));
                        assert_eq!(ip.is_global(), is_global(ip))
                    }
                }
            }
        }
    }

    #[test]
    #[ignore]
    fn test_ipv6_global() {
        use ring::rand::{SecureRandom, SystemRandom};

        let mut v = [0u8; 16];
        let rand = SystemRandom::new();
        for i in 0..1_000 {
            println!("Iter: {}/1_000", i);
            for _ in 0..10_000_000 {
                rand.fill(&mut v).expect("Error generating random values");
                // The bytes are random, so their order within the address is irrelevant.
                let ip = IpAddr::V6(std::net::Ipv6Addr::from(v));
                assert_eq!(ip.is_global(), is_global(ip))
            }
        }
    }
}
2020-12-08 17:33:15 +01:00
fn is_domain_blacklisted ( domain : & str ) -> bool {
2019-11-02 17:39:01 +01:00
let mut is_blacklisted = CONFIG . icon_blacklist_non_global_ips ( )
& & ( domain , 0 )
2019-10-05 14:48:15 +02:00
. to_socket_addrs ( )
. map ( | x | {
for ip_port in x {
2020-07-14 23:44:01 +02:00
if ! is_global ( ip_port . ip ( ) ) {
2019-10-05 14:48:15 +02:00
warn! ( " IP {} for domain '{}' is not a global IP! " , ip_port . ip ( ) , domain ) ;
return true ;
}
}
false
} )
. unwrap_or ( false ) ;
// Skip the regex check if the previous one is true already
if ! is_blacklisted {
if let Some ( blacklist ) = CONFIG . icon_blacklist_regex ( ) {
2020-12-08 17:33:15 +01:00
let mut regex_hashmap = ICON_BLACKLIST_REGEX . read ( ) . unwrap ( ) ;
// Use the pre-generate Regex stored in a Lazy HashMap if there's one, else generate it.
let regex = if let Some ( regex ) = regex_hashmap . get ( & blacklist ) {
regex
} else {
drop ( regex_hashmap ) ;
let mut regex_hashmap_write = ICON_BLACKLIST_REGEX . write ( ) . unwrap ( ) ;
// Clear the current list if the previous key doesn't exists.
// To prevent growing of the HashMap after someone has changed it via the admin interface.
if regex_hashmap_write . len ( ) > = 1 {
regex_hashmap_write . clear ( ) ;
}
// Generate the regex to store in too the Lazy Static HashMap.
let blacklist_regex = Regex ::new ( & blacklist ) . unwrap ( ) ;
regex_hashmap_write . insert ( blacklist . to_string ( ) , blacklist_regex ) ;
drop ( regex_hashmap_write ) ;
regex_hashmap = ICON_BLACKLIST_REGEX . read ( ) . unwrap ( ) ;
regex_hashmap . get ( & blacklist ) . unwrap ( )
} ;
// Use the pre-generate Regex stored in a Lazy HashMap.
2019-10-05 14:48:15 +02:00
if regex . is_match ( & domain ) {
warn! ( " Blacklisted domain: {:#?} matched {:#?} " , domain , blacklist ) ;
is_blacklisted = true ;
}
}
}
is_blacklisted
}
2021-03-27 14:30:40 +01:00
fn get_icon ( domain : & str ) -> Option < ( Vec < u8 > , bool ) > {
2019-01-25 18:23:51 +01:00
let path = format! ( " {} / {} .png " , CONFIG . icon_cache_folder ( ) , domain ) ;
2018-02-10 01:00:55 +01:00
2020-11-10 03:50:35 +01:00
// Check for expiration of negatively cached copy
if icon_is_negcached ( & path ) {
return None ;
}
2018-06-12 21:09:42 +02:00
if let Some ( icon ) = get_cached_icon ( & path ) {
2021-03-27 14:30:40 +01:00
return Some ( ( icon , true ) ) ;
2018-06-12 21:09:42 +02:00
}
2018-02-10 01:00:55 +01:00
2019-01-28 23:58:32 +01:00
if CONFIG . disable_icon_download ( ) {
2020-11-10 02:45:58 +01:00
return None ;
2019-01-28 23:58:32 +01:00
}
2020-11-10 02:45:58 +01:00
// Get the icon, or None in case of error
2019-01-27 16:03:18 +01:00
match download_icon ( & domain ) {
2018-06-12 21:09:42 +02:00
Ok ( icon ) = > {
save_icon ( & path , & icon ) ;
2021-03-27 14:30:40 +01:00
Some ( ( icon , false ) )
2018-12-19 00:57:45 +01:00
}
2018-07-01 15:27:42 +02:00
Err ( e ) = > {
2018-12-06 20:35:25 +01:00
error! ( " Error downloading icon: {:?} " , e ) ;
2019-11-06 20:34:52 +01:00
let miss_indicator = path + " .miss " ;
2020-12-08 17:33:15 +01:00
save_icon ( & miss_indicator , & [ ] ) ;
2020-11-10 02:45:58 +01:00
None
2018-07-01 15:27:42 +02:00
}
2018-06-12 21:09:42 +02:00
}
2018-02-10 01:00:55 +01:00
}
2018-06-12 21:09:42 +02:00
fn get_cached_icon ( path : & str ) -> Option < Vec < u8 > > {
2018-12-18 22:33:32 +01:00
// Check for expiration of successfully cached copy
if icon_is_expired ( path ) {
2018-12-19 00:57:45 +01:00
return None ;
2018-12-18 22:33:32 +01:00
}
2018-02-15 00:53:11 +01:00
// Try to read the cached icon, and return it if it exists
2018-06-11 15:44:37 +02:00
if let Ok ( mut f ) = File ::open ( path ) {
let mut buffer = Vec ::new ( ) ;
if f . read_to_end ( & mut buffer ) . is_ok ( ) {
2018-06-12 21:09:42 +02:00
return Some ( buffer ) ;
2018-02-10 01:00:55 +01:00
}
}
2018-06-12 21:09:42 +02:00
None
}
2019-01-20 15:36:33 +01:00
fn file_is_expired ( path : & str , ttl : u64 ) -> Result < bool , Error > {
2018-12-18 22:33:32 +01:00
let meta = symlink_metadata ( path ) ? ;
let modified = meta . modified ( ) ? ;
let age = SystemTime ::now ( ) . duration_since ( modified ) ? ;
Ok ( ttl > 0 & & ttl < = age . as_secs ( ) )
}
fn icon_is_negcached ( path : & str ) -> bool {
let miss_indicator = path . to_owned ( ) + " .miss " ;
2019-01-25 18:23:51 +01:00
let expired = file_is_expired ( & miss_indicator , CONFIG . icon_cache_negttl ( ) ) ;
2018-12-19 00:57:45 +01:00
2018-12-18 22:33:32 +01:00
match expired {
// No longer negatively cached, drop the marker
Ok ( true ) = > {
2018-12-19 00:57:45 +01:00
if let Err ( e ) = remove_file ( & miss_indicator ) {
error! ( " Could not remove negative cache indicator for icon {:?}: {:?} " , path , e ) ;
2018-12-18 22:33:32 +01:00
}
false
2018-12-19 00:57:45 +01:00
}
2018-12-18 22:33:32 +01:00
// The marker hasn't expired yet.
2018-12-19 00:57:45 +01:00
Ok ( false ) = > true ,
2018-12-18 22:33:32 +01:00
// The marker is missing or inaccessible in some way.
2018-12-19 00:57:45 +01:00
Err ( _ ) = > false ,
2018-12-18 22:33:32 +01:00
}
}
fn icon_is_expired ( path : & str ) -> bool {
2019-01-25 18:23:51 +01:00
let expired = file_is_expired ( path , CONFIG . icon_cache_ttl ( ) ) ;
2018-12-18 22:33:32 +01:00
expired . unwrap_or ( true )
}
2019-01-28 23:58:32 +01:00
/// A candidate icon location together with its preference ranking.
#[derive(Debug)]
struct Icon {
    /// Ranking of this candidate; lower values are preferred.
    priority: u8,
    /// Location of the icon.
    href: String,
}

impl Icon {
    /// Creates a candidate icon at `href` with the given `priority`.
    const fn new(priority: u8, href: String) -> Self {
        Self {
            priority,
            href,
        }
    }
}
2021-02-07 22:28:02 +01:00
fn get_favicons_node ( node : & std ::rc ::Rc < markup5ever_rcdom ::Node > , icons : & mut Vec < Icon > , url : & Url ) {
if let markup5ever_rcdom ::NodeData ::Element { name , attrs , .. } = & node . data {
if name . local . as_ref ( ) = = " link " {
let mut has_rel = false ;
let mut href = None ;
let mut sizes = None ;
let attrs = attrs . borrow ( ) ;
for attr in attrs . iter ( ) {
let attr_name = attr . name . local . as_ref ( ) ;
let attr_value = attr . value . as_ref ( ) ;
if attr_name = = " rel " & & ICON_REL_REGEX . is_match ( attr_value ) {
has_rel = true ;
} else if attr_name = = " href " {
href = Some ( attr_value ) ;
} else if attr_name = = " sizes " {
sizes = Some ( attr_value ) ;
}
}
if has_rel & & href . is_some ( ) {
if let Ok ( full_href ) = url . join ( & href . unwrap ( ) ) . map ( | h | h . into_string ( ) ) {
let priority = get_icon_priority ( & full_href , sizes ) ;
icons . push ( Icon ::new ( priority , full_href ) ) ;
}
}
}
}
// TODO: Might want to limit the recursion depth?
for child in node . children . borrow ( ) . iter ( ) {
get_favicons_node ( child , icons , url ) ;
}
}
2020-12-10 23:13:24 +01:00
/// Outcome of `get_icon_url`: the candidate icons plus the request context to download them with.
struct IconUrlResult {
/// Candidate icons; `get_icon_url` sorts them by priority before returning.
iconlist : Vec < Icon > ,
/// Cookies collected from the `set-cookie` headers of the page response (empty if none).
cookies : String ,
/// URL of the page the icons were discovered on (empty if the page could not be fetched).
referer : String ,
}
2019-02-04 16:59:52 +01:00
/// Returns a Result/Tuple which holds a Vector IconList and a string which holds the cookies from the last response.
/// There will always be a result with a string which will contain https://example.com/favicon.ico and an empty string for the cookies.
/// This does not mean that that location does exists, but it is the default location browser use.
2019-01-27 15:39:19 +01:00
///
/// # Argument
/// * `domain` - A string which holds the domain with extension.
///
/// # Example
/// ```
2019-02-04 16:59:52 +01:00
/// let (mut iconlist, cookie_str) = get_icon_url("github.com")?;
/// let (mut iconlist, cookie_str) = get_icon_url("gitlab.com")?;
2019-01-27 15:39:19 +01:00
/// ```
2020-12-10 23:13:24 +01:00
fn get_icon_url ( domain : & str ) -> Result < IconUrlResult , Error > {
2019-01-27 15:39:19 +01:00
// Default URL with secure and insecure schemes
let ssldomain = format! ( " https:// {} " , domain ) ;
let httpdomain = format! ( " http:// {} " , domain ) ;
2020-12-08 17:33:15 +01:00
// First check the domain as given during the request for both HTTPS and HTTP.
let resp = match get_page ( & ssldomain ) . or_else ( | _ | get_page ( & httpdomain ) ) {
Ok ( c ) = > Ok ( c ) ,
Err ( e ) = > {
let mut sub_resp = Err ( e ) ;
// When the domain is not an IP, and has more then one dot, remove all subdomains.
let is_ip = domain . parse ::< IpAddr > ( ) ;
if is_ip . is_err ( ) & & domain . matches ( '.' ) . count ( ) > 1 {
let mut domain_parts = domain . split ( '.' ) ;
let base_domain = format! (
" {base}.{tld} " ,
tld = domain_parts . next_back ( ) . unwrap ( ) ,
base = domain_parts . next_back ( ) . unwrap ( )
) ;
if is_valid_domain ( & base_domain ) {
let sslbase = format! ( " https:// {} " , base_domain ) ;
let httpbase = format! ( " http:// {} " , base_domain ) ;
debug! ( " [get_icon_url]: Trying without subdomains '{}' " , base_domain ) ;
sub_resp = get_page ( & sslbase ) . or_else ( | _ | get_page ( & httpbase ) ) ;
}
// When the domain is not an IP, and has less then 2 dots, try to add www. infront of it.
} else if is_ip . is_err ( ) & & domain . matches ( '.' ) . count ( ) < 2 {
let www_domain = format! ( " www. {} " , domain ) ;
if is_valid_domain ( & www_domain ) {
let sslwww = format! ( " https:// {} " , www_domain ) ;
let httpwww = format! ( " http:// {} " , www_domain ) ;
debug! ( " [get_icon_url]: Trying with www. prefix '{}' " , www_domain ) ;
sub_resp = get_page ( & sslwww ) . or_else ( | _ | get_page ( & httpwww ) ) ;
}
}
sub_resp
}
} ;
2020-12-10 23:13:24 +01:00
// Create the iconlist
let mut iconlist : Vec < Icon > = Vec ::new ( ) ;
// Create the cookie_str to fill it all the cookies from the response
// These cookies can be used to request/download the favicon image.
// Some sites have extra security in place with for example XSRF Tokens.
let mut cookie_str = " " . to_string ( ) ;
let mut referer = " " . to_string ( ) ;
2020-05-03 17:24:51 +02:00
if let Ok ( content ) = resp {
2019-01-28 23:58:32 +01:00
// Extract the URL from the respose in case redirects occured (like @ gitlab.com)
2019-01-29 18:08:23 +01:00
let url = content . url ( ) . clone ( ) ;
2019-10-05 14:48:15 +02:00
2020-12-08 17:33:15 +01:00
// Get all the cookies and pass it on to the next function.
// Needed for XSRF Cookies for example (like @ mijn.ing.nl)
2019-01-31 15:49:58 +01:00
let raw_cookies = content . headers ( ) . get_all ( " set-cookie " ) ;
2019-03-18 22:12:39 +01:00
cookie_str = raw_cookies
. iter ( )
2019-07-30 19:38:54 +02:00
. filter_map ( | raw_cookie | raw_cookie . to_str ( ) . ok ( ) )
. map ( | cookie_str | {
if let Ok ( cookie ) = Cookie ::parse ( cookie_str ) {
format! ( " {} = {} ; " , cookie . name ( ) , cookie . value ( ) )
} else {
String ::new ( )
}
2019-03-18 22:12:39 +01:00
} )
. collect ::< String > ( ) ;
2019-01-27 15:39:19 +01:00
2020-12-10 23:13:24 +01:00
// Set the referer to be used on the final request, some sites check this.
// Mostly used to prevent direct linking and other security resons.
referer = url . as_str ( ) . to_string ( ) ;
2019-01-27 15:39:19 +01:00
// Add the default favicon.ico to the list with the domain the content responded from.
2019-03-18 22:12:39 +01:00
iconlist . push ( Icon ::new ( 35 , url . join ( " /favicon.ico " ) . unwrap ( ) . into_string ( ) ) ) ;
2019-01-27 15:39:19 +01:00
2019-12-19 00:37:16 +01:00
// 512KB should be more than enough for the HTML, though as we only really need
// the HTML header, it could potentially be reduced even further
2021-02-07 22:28:02 +01:00
let mut limited_reader = content . take ( 512 * 1024 ) ;
use html5ever ::tendril ::TendrilSink ;
let dom = html5ever ::parse_document ( markup5ever_rcdom ::RcDom ::default ( ) , Default ::default ( ) )
. from_utf8 ( )
. read_from ( & mut limited_reader ) ? ;
2021-03-27 14:30:40 +01:00
2021-02-07 22:28:02 +01:00
get_favicons_node ( & dom . document , & mut iconlist , & url ) ;
2019-01-27 15:39:19 +01:00
} else {
// Add the default favicon.ico to the list with just the given domain
2019-03-18 22:12:39 +01:00
iconlist . push ( Icon ::new ( 35 , format! ( " {} /favicon.ico " , ssldomain ) ) ) ;
2019-10-29 14:24:01 +01:00
iconlist . push ( Icon ::new ( 35 , format! ( " {} /favicon.ico " , httpdomain ) ) ) ;
2019-01-27 15:39:19 +01:00
}
// Sort the iconlist by priority
iconlist . sort_by_key ( | x | x . priority ) ;
// There always is an icon in the list, so no need to check if it exists, and just return the first one
2020-12-10 23:13:24 +01:00
Ok ( IconUrlResult {
iconlist ,
cookies : cookie_str ,
referer
} )
2019-01-27 15:39:19 +01:00
}
2019-01-29 21:20:59 +01:00
fn get_page ( url : & str ) -> Result < Response , Error > {
2020-12-10 23:13:24 +01:00
get_page_with_cookies ( url , " " , " " )
2019-01-31 15:49:58 +01:00
}
2020-12-10 23:13:24 +01:00
fn get_page_with_cookies ( url : & str , cookie_str : & str , referer : & str ) -> Result < Response , Error > {
2020-12-08 17:33:15 +01:00
if is_domain_blacklisted ( Url ::parse ( url ) . unwrap ( ) . host_str ( ) . unwrap_or_default ( ) ) {
err! ( " Favicon rel linked to a blacklisted domain! " ) ;
2019-10-05 14:48:15 +02:00
}
2019-10-05 16:45:36 +02:00
2020-12-08 17:33:15 +01:00
let mut client = CLIENT . get ( url ) ;
if ! cookie_str . is_empty ( ) {
2020-12-10 23:13:24 +01:00
client = client . header ( " Cookie " , cookie_str )
}
if ! referer . is_empty ( ) {
client = client . header ( " Referer " , referer )
2019-10-05 15:45:09 +02:00
}
2020-12-08 17:33:15 +01:00
client . send ( ) ?
. error_for_status ( )
. map_err ( Into ::into )
2019-01-29 21:20:59 +01:00
}
2019-01-27 15:39:19 +01:00
/// Returns a Integer with the priority of the type of the icon which to prefer.
/// The lower the number the better.
///
/// # Arguments
/// * `href` - A string which holds the href value or relative path.
/// * `sizes` - The size of the icon if available as a <width>x<height> value like 32x32.
///
/// # Example
/// ```
/// priority1 = get_icon_priority("http://example.com/path/to/a/favicon.png", "32x32");
/// priority2 = get_icon_priority("https://example.com/path/to/a/favicon.ico", "");
/// ```
2021-02-07 22:28:02 +01:00
fn get_icon_priority ( href : & str , sizes : Option < & str > ) -> u8 {
2019-01-27 15:39:19 +01:00
// Check if there is a dimension set
2019-02-04 12:55:39 +01:00
let ( width , height ) = parse_sizes ( sizes ) ;
2019-01-27 15:39:19 +01:00
2019-02-04 12:55:39 +01:00
// Check if there is a size given
if width ! = 0 & & height ! = 0 {
2019-01-27 15:39:19 +01:00
// Only allow square dimensions
if width = = height {
// Change priority by given size
if width = = 32 {
1
} else if width = = 64 {
2
2020-12-10 23:13:24 +01:00
} else if ( 24 ..= 128 ) . contains ( & width ) {
2019-01-27 15:39:19 +01:00
3
} else if width = = 16 {
4
} else {
2019-02-04 12:55:39 +01:00
5
2019-01-27 15:39:19 +01:00
}
2019-02-04 12:55:39 +01:00
// There are dimensions available, but the image is not a square
2019-01-27 15:39:19 +01:00
} else {
200
}
} else {
// Change priority by file extension
if href . ends_with ( " .png " ) {
10
} else if href . ends_with ( " .jpg " ) | | href . ends_with ( " .jpeg " ) {
20
} else {
30
}
}
}
2019-02-04 12:55:39 +01:00
/// Returns a Tuple with the width and hight as a seperate value extracted from the sizes attribute
/// It will return 0 for both values if no match has been found.
///
/// # Arguments
/// * `sizes` - The size of the icon if available as a <width>x<height> value like 32x32.
2019-02-04 17:27:40 +01:00
///
2019-02-04 12:55:39 +01:00
/// # Example
/// ```
/// let (width, height) = parse_sizes("64x64"); // (64, 64)
/// let (width, height) = parse_sizes("x128x128"); // (128, 128)
/// let (width, height) = parse_sizes("32"); // (0, 0)
/// ```
2021-02-07 22:28:02 +01:00
fn parse_sizes ( sizes : Option < & str > ) -> ( u16 , u16 ) {
2019-02-04 12:55:39 +01:00
let mut width : u16 = 0 ;
let mut height : u16 = 0 ;
2019-03-18 22:12:39 +01:00
if let Some ( sizes ) = sizes {
2020-07-14 21:58:27 +02:00
match ICON_SIZE_REGEX . captures ( sizes . trim ( ) ) {
2019-03-03 16:11:55 +01:00
None = > { }
2019-02-04 12:55:39 +01:00
Some ( dimensions ) = > {
if dimensions . len ( ) > = 3 {
width = dimensions [ 1 ] . parse ::< u16 > ( ) . unwrap_or_default ( ) ;
height = dimensions [ 2 ] . parse ::< u16 > ( ) . unwrap_or_default ( ) ;
}
2019-03-03 16:11:55 +01:00
}
2019-02-04 12:55:39 +01:00
}
}
( width , height )
}
2019-01-27 16:03:18 +01:00
fn download_icon ( domain : & str ) -> Result < Vec < u8 > , Error > {
2020-12-08 17:33:15 +01:00
if is_domain_blacklisted ( domain ) {
2019-10-10 23:21:22 +02:00
err! ( " Domain is blacklisted " , domain )
}
2020-12-10 23:13:24 +01:00
let icon_result = get_icon_url ( & domain ) ? ;
2018-06-12 21:09:42 +02:00
2019-01-29 21:20:59 +01:00
let mut buffer = Vec ::new ( ) ;
2019-02-04 16:59:52 +01:00
2019-11-22 13:16:12 +01:00
use data_url ::DataUrl ;
2020-12-10 23:13:24 +01:00
for icon in icon_result . iconlist . iter ( ) . take ( 5 ) {
2019-11-22 13:16:12 +01:00
if icon . href . starts_with ( " data:image " ) {
let datauri = DataUrl ::process ( & icon . href ) . unwrap ( ) ;
// Check if we are able to decode the data uri
match datauri . decode_to_vec ( ) {
Ok ( ( body , _fragment ) ) = > {
// Also check if the size is atleast 67 bytes, which seems to be the smallest png i could create
if body . len ( ) > = 67 {
buffer = body ;
break ;
}
}
2019-12-27 18:37:14 +01:00
_ = > warn! ( " data uri is invalid " ) ,
2019-11-22 13:16:12 +01:00
} ;
} else {
2020-12-10 23:13:24 +01:00
match get_page_with_cookies ( & icon . href , & icon_result . cookies , & icon_result . referer ) {
2019-11-22 13:16:12 +01:00
Ok ( mut res ) = > {
info! ( " Downloaded icon from {} " , icon . href ) ;
res . copy_to ( & mut buffer ) ? ;
break ;
2020-12-10 23:13:24 +01:00
} ,
_ = > warn! ( " Download failed for {} " , icon . href ) ,
2019-11-22 13:16:12 +01:00
} ;
}
2019-02-04 16:59:52 +01:00
}
2018-06-12 21:09:42 +02:00
2019-01-29 21:20:59 +01:00
if buffer . is_empty ( ) {
err! ( " Empty response " )
}
2018-06-12 21:09:42 +02:00
Ok ( buffer )
}
fn save_icon ( path : & str , icon : & [ u8 ] ) {
2019-11-06 20:21:47 +01:00
match File ::create ( path ) {
Ok ( mut f ) = > {
f . write_all ( icon ) . expect ( " Error writing icon file " ) ;
}
Err ( ref e ) if e . kind ( ) = = std ::io ::ErrorKind ::NotFound = > {
create_dir_all ( & CONFIG . icon_cache_folder ( ) ) . expect ( " Error creating icon cache " ) ;
}
Err ( e ) = > {
info! ( " Icon save error: {:?} " , e ) ;
}
}
2018-02-10 01:00:55 +01:00
}