我对锈病很陌生,这是我在书中写的第一个图书馆。这是一款受原始Python库及其哈斯克尔港启发的文章日期抽取器。
它相当小,大部分逻辑都是从Python库中提取的。我想确定我的锈菌代码是惯用的。
这是extract_date.rs:
use regex::Regex;
use chrono::NaiveDate;
use reqwest;
use std::io::Read;
use select::document::Document;
use select::predicate::{Name, Attr};
use rustc_serialize::json::Json;
use errors::*;
// Some formats borrowed from https://github.com/amir/article-date-extractor
static FMTS: &'static [&str] = &["%A, %B %e, %Y",
"%Y-%m-%dT%H:%M:%S%:z",
"/%Y/%m/%d/",
"/%Y/%d/%m/",
"%Y-%m-%d",
"%B %e, %Y",
"%Y-%m-%d %H:%M:%S",
"%Y-%m-%dT%H:%M:%SZ",
"%B %k, %Y, %H:%M %p",
"%Y-%m-%d %H:%M:%S.000000"];
// Use lazy_static to ensure we only compile the regex once
lazy_static! {
// Regex by Newspaper3k - https://github.com/codelucas/newspaper/blob/master/newspaper/urls.py
static ref RE: Regex =
Regex::new(r"([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})").unwrap();
}
// Parse the date, trying out each format
fn parse_date(input: &str) -> Result<NaiveDate> {
let mut result: Result<NaiveDate> = Err("None of the formats matched the date".into());
'outer: for fmt in FMTS {
if let Ok(v) = NaiveDate::parse_from_str(input, fmt) {
{
result = Ok(v);
break 'outer;
}
}
}
result
}
// Extract date from a URL
fn extract_from_url(url: &str) -> Option<String> {
if let Some(val) = RE.find(url) {
return Some(val.as_str().to_string());
} else {
return None;
}
}
// Extract date from JSON-LD
fn extract_from_ldjson<'a>(html: &'a Document) -> Option<String> {
let mut json_date: Option<String> = None;
let mut _ldjson: String = String::new();
if let Some(ldj) = html.find(Attr("type", "application/ld+json")).next() {
_ldjson = ldj.text();
} else {
return None;
}
let mut _decoded_ldjson: Json = Json::from_str("{}").unwrap();
match Json::from_str(&_ldjson) {
Ok(v) => _decoded_ldjson = v,
_ => return None,
}
if let Some(date_published) = _decoded_ldjson.search("datePublished") {
if let Some(date) = date_published.as_string() {
json_date = Some(date.to_string())
}
} else if let Some(date_created) = _decoded_ldjson.search("dateCreated") {
if let Some(date) = date_created.as_string() {
json_date = Some(date.to_string())
}
}
json_date
}
// Extract date from meta tags
fn extract_from_meta<'a>(html: &'a Document) -> Option<String> {
let mut meta_date: Option<String> = None;
'outer: for meta in html.find(Name("meta")) {
let meta_name: Option<&str> = meta.attr("name");
let item_prop: Option<&str> = meta.attr("itemprop");
let http_equiv: Option<&str> = meta.attr("http-equiv");
let meta_property: Option<&str> = meta.attr("property");
if let Some(v) = meta_name {
match v.to_lowercase().as_ref() {
"pubdate" |
"publishdate" |
"timestamp" |
"dc.date.issued" |
"date" |
"sailthru.date" |
"article.published" |
"published-date" |
"article.created" |
"article_date_original" |
"cxenseparse:recs:publishtime" |
"date_published" => {
if let Some(ct) = meta.attr("content") {
{
meta_date = Some(ct.trim().to_string());
break 'outer;
}
}
}
_ => {}
}
}
if let Some(v) = item_prop {
match v.to_lowercase().as_ref() {
"datepublished" | "datecreated" => {
if let Some(ct) = meta.attr("content") {
{
meta_date = Some(ct.trim().to_string());
break 'outer;
}
}
}
_ => {}
}
}
if let Some(v) = http_equiv {
match v.to_lowercase().as_ref() {
"date" => {
if let Some(ct) = meta.attr("content") {
{
meta_date = Some(ct.trim().to_string());
break 'outer;
}
}
}
_ => {}
}
}
if let Some(v) = meta_property {
match v.as_ref() {
"article:published_time" |
"bt:pubdate" => {
if let Some(ct) = meta.attr("content") {
{
meta_date = Some(ct.trim().to_string());
break 'outer;
}
}
}
"og:image" => {
if let Some(url) = meta.attr("content") {
{
meta_date = extract_from_url(url.trim());
break 'outer;
}
}
}
_ => {}
}
}
}
meta_date
}
// Extract from html tags
fn extract_from_html_tag<'a>(html: &'a Document) -> Option<String> {
lazy_static! {
static ref TAG_RE: Regex =
Regex::new(r"(?i)publishdate|pubdate|timestamp|article_date|articledate|date").unwrap();
}
let mut date: Option<String> = None;
'initial: for time in html.find(Name("time")) {
if let Some(dt) = time.attr("datetime") {
{
date = Some(dt.to_string());
break 'initial;
}
} else if let Some("timestamp") = time.attr("class") {
{
date = Some(time.text().trim_matches('\n').to_string());
break 'initial;
}
}
}
if date.is_none() {
'outer: for tag in html.find(Name("span")) {
if let Some("datePublished") = tag.attr("itemprop") {
if let Some(v) = tag.attr("content") {
{
date = Some(v.to_string());
break 'outer;
}
} else if !tag.text().is_empty() {
{
date = Some(tag.text().trim_matches('\n').to_string());
break 'outer;
}
}
}
}
}
// These next three loops are due to the lack of `find_all` method for select.rs library
if date.is_none() {
'outer_first: for tag in html.find(Name("span")) {
if TAG_RE.is_match(tag.attr("class").unwrap_or("")) {
{
date = Some(tag.text().trim_matches('\n').to_string());
break 'outer_first;
}
}
}
}
if date.is_none() {
'outer_second: for tag in html.find(Name("p")) {
if TAG_RE.is_match(tag.attr("class").unwrap_or("")) {
{
date = Some(tag.text().trim_matches('\n').to_string());
break 'outer_second;
}
}
}
}
if date.is_none() {
'outer_third: for tag in html.find(Name("div")) {
if TAG_RE.is_match(tag.attr("class").unwrap_or("")) {
{
date = Some(tag.text().trim_matches('\n').to_string());
break 'outer_third;
}
}
}
}
date
}
// Try to extract the date by using each function one by one
/// This function attempts to extract the article date by using several different methods in a row.
/// The following methods are used: extracting the date from url, JSON-LD, meta tags, additional html tags.
///
/// Supported date formats:
///
///
///"%A, %B %e, %Y"
///
///"%Y-%m-%dT%H:%M:%S%:z"
///
///"/%Y/%m/%d/"
///
///"/%Y/%d/%m/"
///
///"%Y-%m-%d"
///
///"%B %e, %Y"
///
///"%Y-%m-%d %H:%M:%S"
///
///"%Y-%m-%dT%H:%M:%SZ"
///
///"%B %k, %Y, %H:%M %p"
///
///"%Y-%m-%d %H:%M:%S.000000"
///
pub fn extract_article_published_date(link: &str, html: Option<String>) -> Result<NaiveDate> {
let mut body: String = String::new();
let mut _parsed_body: Option<Document> = None;
if let Some(v) = extract_from_url(link) {
return parse_date(&v);
}
if html.is_none() {
if let Ok(mut response) = reqwest::get(link) {
response.read_to_string(&mut body).unwrap();
let doc = Document::from(body.as_str());
_parsed_body = Some(doc);
} else {
return Err("Couldn't open the link".into());
}
} else {
_parsed_body = Some(Document::from(html.unwrap().as_str()))
}
if let Some(v) = extract_from_url(link) {
return parse_date(&v);
} else if let Some(v) = extract_from_ldjson(_parsed_body.as_ref().unwrap()) {
return parse_date(&v);
} else if let Some(v) = extract_from_meta(_parsed_body.as_ref().unwrap()) {
return parse_date(&v);
} else if let Some(v) = extract_from_html_tag(_parsed_body.as_ref().unwrap()) {
return parse_date(&v);
} else {
return Err("Couldn't find the date to parse".into());
}
}
// Unit tests
#[cfg(test)]
mod test {
use super::extract_from_url;
use super::parse_date;
use super::extract_from_meta;
use super::extract_from_ldjson;
use super::extract_from_html_tag;
use chrono::NaiveDate;
use reqwest;
use reqwest::Response;
use std::io::Read;
use select::document::Document;
#[test]
fn parsing_date() {
assert_eq!(NaiveDate::from_ymd(2015, 11, 30),
parse_date("/2015/11/30/").unwrap());
assert_eq!(NaiveDate::from_ymd(2015, 11, 30),
parse_date("/2015/30/11/").unwrap());
assert!(parse_date("bad_format").is_err());
}
#[test]
fn extracting_from_url() {
let link: &str = "http://edition.cnn.\
com/2015/11/28/opinions/sutter-cop21-paris-preview-two-degrees/index.\
html";
assert_eq!(Some("/2015/11/28/".to_string()), extract_from_url(link));
let link: &str = "";
assert_eq!(None, extract_from_url(link));
}
#[test]
fn extracting_from_ldjson() {
let mut response: Response =
reqwest::get("https://techcrunch.com/2015/11/30/atlassian-share-price/").unwrap();
let mut body: String = String::new();
response.read_to_string(&mut body).unwrap();
let document: Document = Document::from(body.as_str());
assert_eq!(Some("2015-12-01T07:50:48Z".to_string()),
extract_from_ldjson(&document));
}
#[test]
fn extracting_from_meta() {
let mut response: Response =
reqwest::get("https://techcrunch.com/2015/11/30/atlassian-share-price/").unwrap();
let mut body: String = String::new();
response.read_to_string(&mut body).unwrap();
let document: Document = Document::from(body.as_str());
assert_eq!(Some(("2015-11-30 23:50:48".to_string())),
extract_from_meta(&document));
}
#[test]
fn extracting_from_html_tag() {
let mut response: Response =
reqwest::get("https://research.googleblog.\
com/2017/03/announcing-guetzli-new-open-source-jpeg.html")
.unwrap();
let mut body: String = String::new();
response.read_to_string(&mut body).unwrap();
let document: Document = Document::from(body.as_str());
assert_eq!(Some("Thursday, March 16, 2017".to_string()),
extract_from_html_tag(&document));
}
}errors.rs:
error_chain! {}lib.rs:
/*!
This crate provides a library for extracting the publication date from
an article or a blog plost. It was heavily influenced by both the original
[article-date-extractor](https://github.com/Webhose/article-date-extractor)
written in Python, as well as its [Haskell port](https://github.com/amir/article-date-extractor).
# Example: extracting a date from a news article
```rust使用article_date_extractor::extract_date::extract_article_published_date;
设link = "http://edition.cnn.com/2015/11/28/opinions/sutter-cop21-paris-preview-two-degrees/index.html“;
断言!(extract_article_published_date(&link,None).is_ok());
*/
#![recursion_limit = "1024"]
extern crate regex;
#[macro_use]
extern crate lazy_static;
extern crate chrono;
extern crate reqwest;
extern crate select;
extern crate rustc_serialize;
#[macro_use]
extern crate error_chain;
pub mod extract_date;
mod errors;Cargo.toml依赖关系:
[dependencies]
regex = "0.2"
lazy_static = "0.2.2"
chrono = "0.3"
reqwest = "0.4.0"
select = { git = "https://github.com/utkarshkukreti/select.rs" }
rustc-serialize = "0.3"
error-chain = "0.10.0"测试/集成测试。:
extern crate article_date_extractor;
extern crate chrono;
extern crate reqwest;
#[test]
fn integration_test() {
use article_date_extractor::extract_date::extract_article_published_date;
use chrono::NaiveDate;
use reqwest;
use std::io::Read;
let link_1 = "http://edition.cnn.\
com/2015/11/28/opinions/sutter-cop21-paris-preview-two-degrees/index.html";
let link_2 = "https://www.nytimes.\
com/2017/03/15/style/meditation-studio-sound-baths-mndfl-new-york.html";
let link_3 = "http://www.bbc.com/news/world-middle-east-39298218";
let link_4 = "https://research.googleblog.com/2017/03/announcing-guetzli-new-open-source-jpeg.\
html";
let link_5 = "http://theklog.co/type-of-water-to-wash-face-with/";
let mut response =
reqwest::get("http://edition.cnn.\
com/2015/11/28/opinions/sutter-cop21-paris-preview-two-degrees/index.html")
.unwrap();
let mut body = String::new();
response.read_to_string(&mut body).unwrap();
assert_eq!(NaiveDate::from_ymd(2015, 11, 28),
extract_article_published_date(&link_1, None).unwrap());
assert_eq!(NaiveDate::from_ymd(2015, 11, 28),
extract_article_published_date(&link_1, Some(body)).unwrap());
assert_eq!(NaiveDate::from_ymd(2017, 03, 15),
extract_article_published_date(&link_2, None).unwrap());
assert_eq!(NaiveDate::from_ymd(2017, 03, 16),
extract_article_published_date(&link_3, None).unwrap());
assert_eq!(NaiveDate::from_ymd(2017, 03, 16),
extract_article_published_date(&link_4, None).unwrap());
assert_eq!(NaiveDate::from_ymd(2017, 03, 16),
extract_article_published_date(&link_5, None).unwrap());
assert!((extract_article_published_date("", None)).is_err());
}发布于 2017-03-24 00:54:48
除了另一个答案中提到的一切..。
'static来避免在static中使用它,那么请充分使用它。只要说&[&str]parse_date中遍历格式,不如使用Iterator和Option适配器。Option和Iterator上的所有方法。他们会帮你节省很多时间。map是无价的。见extract_from_url。// Extract date from a URL就在extract_from_url方法之前。这对你有什么好处?如果您对函数有什么有用的信息,请使用doc注释。extract_from_ldjson和其他许多内容。_ldjson,_decoded_ldjson)。Option::and_then,Option::map,Option::or等。if let语句中。meta.attr("content")的if let在每个子句中都会重复,但是如果它是None,我们应该立即失败。把它拉出来,早点检查。static FMTS: &[&str] = &["%A, %B %e, %Y",
"%Y-%m-%dT%H:%M:%S%:z",
"/%Y/%m/%d/",
"/%Y/%d/%m/",
"%Y-%m-%d",
"%B %e, %Y",
"%Y-%m-%d %H:%M:%S",
"%Y-%m-%dT%H:%M:%SZ",
"%B %k, %Y, %H:%M %p",
"%Y-%m-%d %H:%M:%S.000000"];
fn parse_date(input: &str) -> Result<NaiveDate> {
FMTS.iter()
.flat_map(|fmt| NaiveDate::parse_from_str(input, fmt))
.next()
.ok_or("None of the formats matched the date".into())
}
fn extract_from_url(url: &str) -> Option<String> {
RE.find(url).map(|val| val.as_str().to_string())
}
fn extract_from_ldjson(html: &Document) -> Option<String> {
html.find(Attr("type", "application/ld+json"))
.next()
.map(|ldj| ldj.text())
.and_then(|ldjson| Json::from_str(&ldjson).ok())
.and_then(|_decoded_ldjson| {
let published = _decoded_ldjson
.search("datePublished")
.and_then(|date| date.as_string())
.map(|date| date.to_string());
let created = _decoded_ldjson
.search("dateCreated")
.and_then(|date| date.as_string())
.map(|date| date.to_string());
published.or(created)
})
}
fn meta_name_denotes_date(meta_name: &str) -> bool {
match meta_name.to_lowercase().as_str() {
"pubdate" |
"publishdate" |
"timestamp" |
"dc.date.issued" |
"date" |
"sailthru.date" |
"article.published" |
"published-date" |
"article.created" |
"article_date_original" |
"cxenseparse:recs:publishtime" |
"date_published" => true,
_ => false,
}
}
fn meta_itemprop_denotes_date(item_prop: &str) -> bool {
match item_prop.to_lowercase().as_str() {
"datepublished" | "datecreated" => true,
_ => false
}
}
fn meta_http_equiv_denotes_date(http_equiv: &str) -> bool {
match http_equiv.to_lowercase().as_str() {
"date" => true,
_ => false,
}
}
fn meta_property_denotes_date(meta_property: &str) -> bool {
match meta_property {
"article:published_time" | "bt:pubdate" => true,
_ => false,
}
}
fn extract_from_meta(html: &Document) -> Option<String> {
html.find(Name("meta")).flat_map(|meta| {
let content = match meta.attr("content") {
Some(c) => c,
None => return None,
};
let content = content.trim();
let meta_name = meta.attr("name");
let item_prop = meta.attr("itemprop");
let http_equiv = meta.attr("http-equiv");
let meta_property = meta.attr("property");
let content_has_date = meta_name.map(meta_name_denotes_date)
.or_else(|| item_prop.map(meta_itemprop_denotes_date))
.or_else(|| http_equiv.map(meta_http_equiv_denotes_date))
.or_else(|| meta_property.map(meta_property_denotes_date))
.unwrap_or(false);
if content_has_date {
Some(content.to_string())
} else if Some("og:image") == meta_property {
extract_from_url(content)
} else {
None
}
}).next()
}在这一点上,我厌倦了一遍又一遍地修复相同类型的错误,因此希望在剩下的代码^_^中没有什么值得评论的地方了。
发布于 2017-03-23 18:42:08
首先,没有必要在任何地方指定类型。Rust编译器支持类型推断,因此
let mut _ldjson: String = String::new();是一样的
let mut _ldjson = String::new();在可能的情况下使用类型推断。干的。顺便说一句,我建议避免以下划线开头的名称:可读性受损。
几乎所有的东西都是一种表达。更喜欢表达式而不是语句。它允许简化代码并避免不必要的初始化。
let mut _decoded_ldjson: Json = Json::from_str("{}").unwrap();
match Json::from_str(&_ldjson) {
Ok(v) => _decoded_ldjson = v,
_ => return None,
}vs
let decoded_ldjson = match Json::from_str(&ldjson) {
Ok(v) => v,
_ => return None;
};这种方法还允许使更多的值不可变,这在大多数情况下会导致更有效的代码。
锈蚀标准库提供了函数,可以避免代码中不必要的分支。让我们将这个技巧与前面的技巧结合起来,看看会发生什么:
pub fn extract_article_published_date(link: &str, html: Option<String>) -> Result<NaiveDate> {
let mut body: String = String::new();
let mut _parsed_body: Option<Document> = None;
if let Some(v) = extract_from_url(link) {
return parse_date(&v);
}
if html.is_none() {
if let Ok(mut response) = reqwest::get(link) {
response.read_to_string(&mut body).unwrap();
let doc = Document::from(body.as_str());
_parsed_body = Some(doc);
} else {
return Err("Couldn't open the link".into());
}
} else {
_parsed_body = Some(Document::from(html.unwrap().as_str()))
}
if let Some(v) = extract_from_url(link) {
return parse_date(&v);
} else if let Some(v) = extract_from_ldjson(_parsed_body.as_ref().unwrap()) {
return parse_date(&v);
} else if let Some(v) = extract_from_meta(_parsed_body.as_ref().unwrap()) {
return parse_date(&v);
} else if let Some(v) = extract_from_html_tag(_parsed_body.as_ref().unwrap()) {
return parse_date(&v);
} else {
return Err("Couldn't find the date to parse".into());
}
}=>
pub fn extract_article_published_date(link: &str, html: &str) -> Result<NaiveDate> {
let doc = Document::from(html);
extract_from_url(link)
.or_else(|| extract_from_ldjson(doc.as_ref()))
.or_else(|| extract_from_meta(doc.as_ref()))
.or_else(|| extract_from_html_tag(doc.as_ref()))
.ok_or("Couldn't find the date to parse".into())
.map(|v| parse_date(&v))
}这就是惯用的锈菌代码!
我还删除了下载网页的代码,因为没有必要下载它。这样做可以将库从网络代码中解放出来,并允许用户将自己喜爱的库用于http请求(例如超高压)。
https://codereview.stackexchange.com/questions/158516
复制相似问题