From ba2cd87ec40ef4ab5fd0b588950e231455637d21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20BERTHO?= Date: Tue, 23 Jun 2020 18:29:55 +0200 Subject: [PATCH] Ajout du search parser amazon --- TODO.md | 5 ++-- src/main.rs | 25 +++++++++++++++-- src/parser.rs | 15 ++++++++--- src/parser/amazon.rs | 35 ++++++++++++++++++------ src/parser/darty.rs | 4 +-- src/parser/du_bruit_dans_la_cuisine.rs | 4 +-- src/parser/fnac.rs | 4 +-- src/parser/ldlc.rs | 4 +-- src/price_checker.rs | 37 +++++++++++++++++++------- 9 files changed, 99 insertions(+), 34 deletions(-) diff --git a/TODO.md b/TODO.md index f4775fd..88a3548 100644 --- a/TODO.md +++ b/TODO.md @@ -3,8 +3,9 @@ * [x] Récupération prix darty avec [scraper](https://crates.io/crates/scraper) et [reqwest](https://crates.io/crates/reqwest) * [x] Ajout du support de la Fnac, … * [x] Récupération URL ligne de commande avec [clap](https://crates.io/crates/clap) -* [ ] Ajout de SearchParser pour recherché un article sur tous les parseurs -* [ ] Ajout des pays avec [celes](https://crates.io/crates/celes) : recherche uniquement sur les parser du pays et parseur multi pays (amazon) +* [ ] Ajout de SearchParser pour rechercher un article sur tous les parseurs +* [ ] Commande de liste des différents parseurs +* [ ] Ajout des pays avec [celes](https://crates.io/crates/celes) : recherche uniquement sur les parser du pays et parseur multi pays (amazon par exemple) * [ ] Lecture des URLs depuis un fichier avec [toml](https://crates.io/crates/toml) * [ ] Parallélisation des requêtes avce [rayon](rayon) ou reqwest asynchrone * [ ] Écriture dans un fichier ODS avec [calamine](https://crates.io/crates/calamine) diff --git a/src/main.rs b/src/main.rs index 5ee0559..551a004 100644 --- a/src/main.rs +++ b/src/main.rs @@ -24,13 +24,34 @@ fn main() -> Result<()> { .required(true) .multiple(true) .help("The URL to get price"))) + .subcommand(SubCommand::with_name("search") + .about("Search an object") + .arg(Arg::with_name("name") + 
.required(true) + .multiple(true) + .help("The name of the object"))) .get_matches(); + let price_checker = PriceChecker::new().unwrap(); match matches.subcommand() { ("check", Some(check_matches)) => { - let price_checker = PriceChecker::new().unwrap(); for url_str in check_matches.values_of("URL").unwrap() { let url = Url::parse(url_str)?; - println!("{}", price_checker.get_price(url)?) ; + println!("{}", price_checker.get_price(&url)?) ; + } + }, + ("search", Some(check_matches)) => { + for name in check_matches.values_of("name").unwrap() { + let res = price_checker.search(name)? ; + if res.is_empty() { + println!("«{}» not found", name); + } else { + for (parser_name, url) in res { + let price = price_checker.get_price(&url)?; + println!(" - {}", parser_name) ; + println!(" * {}", url); + println!(" * {}", price); + } + } } }, _ => { diff --git a/src/parser.rs b/src/parser.rs index b077ea9..958b417 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -14,7 +14,7 @@ pub trait Parser{ /// Create the parser fn new() -> Result where Self : Sized; /// Get the name - fn name() -> &'static str where Self : Sized; + fn name(&self) -> &'static str; } /// Trait needed to get price from a specific website @@ -22,15 +22,21 @@ pub trait PriceParser : Parser{ /// Indicate if it can parse this URL fn can_parse(&self, url : &Url) -> bool; /// Parse the html into a price - fn parse(&self, html : &Html) -> Result; + fn parse_price(&self, html : &Html) -> Result; } -// @todo Macro générateur liste et tests +pub trait SearchParser : Parser { + /// Return the search URL + fn search_url(&self, name: &str) -> Url; + /// Return the first occurrence of result of the page if any + fn search(&self, html : &Html) -> Result>; +} macro_rules! 
gen_list { ( $([$module:ident::$name:ident : $($array:ident),*]),* ) => { #[derive(Arraygen, Debug)] #[gen_array(pub fn get_price: & dyn PriceParser)] + #[gen_array(pub fn get_search: & dyn SearchParser)] pub struct List { $( $( @@ -60,11 +66,12 @@ gen_list!( [fnac::Fnac : get_price], [du_bruit_dans_la_cuisine::DuBruitDansLaCuisine : get_price], [ldlc::LDLC : get_price], - [amazon::Amazon : get_price] + [amazon::Amazon : get_price, get_search] ); #[test] fn test_parser_list() { let parser_list = List::new().unwrap(); assert_eq!(parser_list.get_price().len(), 5); + assert_eq!(parser_list.get_search().len(), 1); } \ No newline at end of file diff --git a/src/parser/amazon.rs b/src/parser/amazon.rs index 0b3880b..b61fac1 100644 --- a/src/parser/amazon.rs +++ b/src/parser/amazon.rs @@ -1,4 +1,4 @@ -use super::{Parser, PriceParser}; +use super::{Parser, PriceParser, SearchParser}; use crate::PriceResult; use scraper::{Selector, Html}; use url::Url; @@ -9,19 +9,23 @@ use anyhow::{Result, anyhow}; pub struct Amazon { price_selector: Selector, name_selector: Selector, - product_selector: Selector + product_selector: Selector, + search_selector_1: Selector, + search_selector_2: Selector } impl Parser for Amazon { fn new() -> Result { Ok(Amazon { - price_selector: Selector::parse(r"#priceblock_ourprice").unwrap(), + price_selector: Selector::parse(r".a-color-price").unwrap(), name_selector: Selector::parse(r"#productTitle").unwrap(), - product_selector: Selector::parse(r".nav-search-label").unwrap() + product_selector: Selector::parse(r".nav-search-label").unwrap(), + search_selector_1: Selector::parse(r".rush-component[data-component-type=s-product-image]").unwrap(), + search_selector_2: Selector::parse(r".a-link-normal").unwrap() }) } - fn name() -> &'static str { + fn name(&self) -> &'static str { "Amazon" } } @@ -32,7 +36,7 @@ impl PriceParser for Amazon { url.host_str().unwrap_or("") == "www.amazon.fr" } - fn parse(&self, html : &Html) -> Result { + fn 
parse_price(&self, html : &Html) -> Result { // Get price let price_element = html.select(&self.price_selector).next().ok_or(anyhow!("No price element"))?; let mut price_text_it = price_element.text(); @@ -54,10 +58,25 @@ impl PriceParser for Amazon { } } +impl SearchParser for Amazon { + fn search_url(&self, name: &str) -> Url { + Url::parse(& format!("https://www.amazon.fr/s?k={}", name)).unwrap() + } + + fn search(&self, html : &Html) -> Result> { + let search_element_1 = html.select(&self.search_selector_1).next().ok_or(anyhow!("No search element 1"))?; + let search_element_2 = search_element_1.select(&self.search_selector_2).next().ok_or(anyhow!("No search element 2"))?; + let path_url = search_element_2.value().attr("href").ok_or(anyhow!("No link element"))?; + let mut url = Url::parse("https://www.amazon.fr").unwrap(); + url.set_path(path_url.split("/ref").next().unwrap_or(path_url)); + Ok(Option::Some(url)) + } +} + #[test] fn test_parser_fnac() { let fnac_parser = Amazon::new().unwrap(); - assert!(fnac_parser.can_parse(&Url::parse("https://www.amazon.fr/AmazonBasics-Bo%C3%AEte-crayons-papier-pr%C3%A9taill%C3%A9s/dp/B071JM699B?ref_=ast_sto_dp").unwrap())); - assert!(fnac_parser.can_parse(&Url::parse("https://www.amazon.fr/AmazonBasics-Bo%C3%AEte-crayons-papier-pr%C3%A9taill%C3%A9s/dp/B071JM699B?ref_=ast_sto_dp").unwrap())); + assert!(fnac_parser.can_parse(&Url::parse("https://www.amazon.fr/AmazonBasics-Bo%C3%AEte-crayons-papier-pr%C3%A9taill%C3%A9s/dp/B071JM699B").unwrap())); + assert!(fnac_parser.can_parse(&Url::parse("https://www.amazon.fr/AmazonBasics-Bo%C3%AEte-crayons-papier-pr%C3%A9taill%C3%A9s/dp/B071JM699B").unwrap())); assert!(fnac_parser.can_parse(&Url::parse("https://www.amazon.com").unwrap()) == false); } \ No newline at end of file diff --git a/src/parser/darty.rs b/src/parser/darty.rs index 7358f9c..ab2b28f 100644 --- a/src/parser/darty.rs +++ b/src/parser/darty.rs @@ -21,7 +21,7 @@ impl Parser for Darty { }) } - fn name() -> &'static str { + 
fn name(&self) -> &'static str { "Darty" } } @@ -31,7 +31,7 @@ impl PriceParser for Darty { url.host_str().unwrap_or("") == "www.darty.com" } - fn parse(&self, html : &Html) -> Result { + fn parse_price(&self, html : &Html) -> Result { // Get price let price_element = html.select(&self.price_selector).next().ok_or(anyhow!("No price element"))?; let mut price_text_it = price_element.text(); diff --git a/src/parser/du_bruit_dans_la_cuisine.rs b/src/parser/du_bruit_dans_la_cuisine.rs index 9d98c68..332ed03 100644 --- a/src/parser/du_bruit_dans_la_cuisine.rs +++ b/src/parser/du_bruit_dans_la_cuisine.rs @@ -19,7 +19,7 @@ impl Parser for DuBruitDansLaCuisine { }) } - fn name() -> &'static str { + fn name(&self) -> &'static str { "Du bruit dans la Cuisine" } } @@ -29,7 +29,7 @@ impl PriceParser for DuBruitDansLaCuisine { url.host_str().unwrap_or("") == "www.dubruitdanslacuisine.fr" } - fn parse(&self, html : &Html) -> Result { + fn parse_price(&self, html : &Html) -> Result { // Get price let price_element = html.select(&self.price_selector).next().ok_or(anyhow!("No price element"))?; let mut price_text_it = price_element.text(); diff --git a/src/parser/fnac.rs b/src/parser/fnac.rs index dde42b1..5473c24 100644 --- a/src/parser/fnac.rs +++ b/src/parser/fnac.rs @@ -21,7 +21,7 @@ impl Parser for Fnac{ }) } - fn name() -> &'static str { + fn name(&self) -> &'static str { "Fnac" } } @@ -31,7 +31,7 @@ impl PriceParser for Fnac { url.host_str().unwrap_or("") == "www.fnac.com" } - fn parse(&self, html : &Html) -> Result { + fn parse_price(&self, html : &Html) -> Result { // Get price let price_element = html.select(&self.price_selector).next().ok_or(anyhow!("No price element"))?; let mut price_text_it = price_element.text(); diff --git a/src/parser/ldlc.rs b/src/parser/ldlc.rs index c03d411..bb2cefb 100644 --- a/src/parser/ldlc.rs +++ b/src/parser/ldlc.rs @@ -19,7 +19,7 @@ impl Parser for LDLC { }) } - fn name() -> &'static str { + fn name(&self) -> &'static str { "LDLC" } } @@ 
-29,7 +29,7 @@ impl PriceParser for LDLC { url.host_str().unwrap_or("") == "www.ldlc.com" } - fn parse(&self, html : &Html) -> Result { + fn parse_price(&self, html : &Html) -> Result { // Get price let price_element = html.select(&self.price_selector).nth(4).ok_or(anyhow!("No price element"))?; let mut price_text_it = price_element.text(); diff --git a/src/price_checker.rs b/src/price_checker.rs index 841e2e9..8251226 100644 --- a/src/price_checker.rs +++ b/src/price_checker.rs @@ -26,13 +26,30 @@ impl PriceChecker { }) } - /// Get a price from an URL - pub fn get_price(&self, url : Url) -> Result { - let parser = *self.parser_list.get_price().iter().find(|p| p.can_parse(&url)).ok_or(anyhow!("No parser can parse {}", url))?; + fn get_html(&self, url : &Url) -> Result { let response = self.client.get(url.clone()).send()?; let text = response.text()?; - let document = Html::parse_document(&text); - Ok(parser.parse(&document)?) + Ok(Html::parse_document(&text)) + } + + /// Get a price from an URL + pub fn get_price(&self, url : &Url) -> Result { + let parser = *self.parser_list.get_price().iter().find(|p| p.can_parse(&url)).ok_or(anyhow!("No parser can parse {}", url))?; + let document = self.get_html(url)?; + Ok(parser.parse_price(&document)?) + } + + /// Search an object in all parsers + pub fn search(&self, name: &str) -> Result> { + let mut urls = Vec::new(); + for parser in self.parser_list.get_search().iter() { + let search_url = parser.search_url(name); + let document = self.get_html(&search_url)?; + if let Some(url) = parser.search(&document)? 
{ + urls.push((parser.name(), url)); + } + } + Ok(urls) } } @@ -41,29 +58,29 @@ fn test_price_checker() { let price_checker = PriceChecker::new().unwrap(); // Test darty - let price_result = price_checker.get_price(Url::parse("https://www.darty.com/nav/achat/gros_electromenager/refrigerateur-congelateur-refrigerateur-cong/refrigerateur-congelateur_bas/samsung_rb33n300nsa_ef.html").unwrap()).unwrap(); + let price_result = price_checker.get_price(&Url::parse("https://www.darty.com/nav/achat/gros_electromenager/refrigerateur-congelateur-refrigerateur-cong/refrigerateur-congelateur_bas/samsung_rb33n300nsa_ef.html").unwrap()).unwrap(); assert!(price_result.name != ""); assert!(price_result.price != 0.); assert!(price_result.product != ""); // Test fnac - let price_result = price_checker.get_price(Url::parse("https://www.fnac.com/a12584732/Kaamelott-Les-Six-Livres-L-integrale-de-la-serie-Coffret-Blu-ray-Alexandre-Astier-Blu-ray").unwrap()).unwrap(); + let price_result = price_checker.get_price(&Url::parse("https://www.fnac.com/a12584732/Kaamelott-Les-Six-Livres-L-integrale-de-la-serie-Coffret-Blu-ray-Alexandre-Astier-Blu-ray").unwrap()).unwrap(); assert!(price_result.name != ""); assert!(price_result.price != 0.); assert!(price_result.product != ""); // Test du bruis dans la cuisine - let price_result = price_checker.get_price(Url::parse("https://www.dubruitdanslacuisine.fr/tapis-a-patisserie-40-62-14377-p").unwrap()).unwrap(); + let price_result = price_checker.get_price(&Url::parse("https://www.dubruitdanslacuisine.fr/tapis-a-patisserie-40-62-14377-p").unwrap()).unwrap(); assert!(price_result.name != ""); assert!(price_result.price != 0.); // LDLC - let price_result = price_checker.get_price(Url::parse("https://www.ldlc.com/fiche/PB00335410.html").unwrap()).unwrap(); + let price_result = price_checker.get_price(&Url::parse("https://www.ldlc.com/fiche/PB00335410.html").unwrap()).unwrap(); assert!(price_result.name != ""); assert!(price_result.price != 0.); // Amazon - 
let price_result = price_checker.get_price(Url::parse("https://www.amazon.fr/AmazonBasics-Bo%C3%AEte-crayons-papier-pr%C3%A9taill%C3%A9s/dp/B071JM699B?ref_=ast_sto_dp&th=1").unwrap()).unwrap(); + let price_result = price_checker.get_price(&Url::parse("https://www.amazon.fr/AmazonBasics-Bo%C3%AEte-crayons-papier-pr%C3%A9taill%C3%A9s/dp/B071JM699B?ref_=ast_sto_dp&th=1").unwrap()).unwrap(); assert!(price_result.name != ""); assert!(price_result.price != 0.); assert!(price_result.product != "");