1use serde_json::json;
34use std::borrow::Cow;
35use std::collections::HashMap;
36
37use crate::error::{HtmlError, Result, SeoErrorKind};
38use once_cell::sync::Lazy;
39use regex::{Captures, Regex};
40use scraper::{Html, Selector};
41
/// Largest HTML input, measured in bytes, that the generator functions in
/// this module accept before returning an "input too large" error.
const MAX_HTML_SIZE: usize = 1_000_000;
/// Page type used for the structured-data `@type` when the caller does not
/// configure one.
const DEFAULT_PAGE_TYPE: &str = "WebPage";
/// Value written to the `@context` key of generated structured data.
const SCHEMA_ORG_CONTEXT: &str = "https://schema.org";
/// Open Graph type a builder created with `MetaTagsBuilder::new` starts with.
const DEFAULT_OG_TYPE: &str = "website";
51
52static HTML_ESCAPES: Lazy<Regex> = Lazy::new(|| {
54 Regex::new(r#"[&<>"']"#).expect("static HTML_ESCAPES must compile")
55});
56
57static META_DESC_SELECTOR: Lazy<Selector> = Lazy::new(|| {
59 Selector::parse("meta[name='description']")
60 .expect("static META_DESC_SELECTOR must parse")
61});
62
63static TITLE_SELECTOR: Lazy<Selector> = Lazy::new(|| {
65 Selector::parse("title").expect("static TITLE_SELECTOR must parse")
66});
67
68static PARAGRAPH_SELECTOR: Lazy<Selector> = Lazy::new(|| {
70 Selector::parse("p").expect("static PARAGRAPH_SELECTOR must parse")
71});
72
73#[derive(Debug, Clone)]
88pub struct StructuredDataConfig {
89 pub additional_data: Option<HashMap<String, String>>,
91 pub page_type: String,
93 pub additional_types: Vec<String>,
95}
96
97impl Default for StructuredDataConfig {
98 fn default() -> Self {
99 Self {
100 additional_data: None,
101 page_type: String::from(DEFAULT_PAGE_TYPE),
102 additional_types: Vec::new(),
103 }
104 }
105}
106
107impl StructuredDataConfig {
108 fn validate(&self) -> Result<()> {
116 validate_page_type(&self.page_type)?;
117
118 if self.additional_types.iter().any(String::is_empty) {
119 return Err(HtmlError::seo(
120 SeoErrorKind::InvalidStructuredData,
121 "Additional types cannot be empty",
122 None,
123 ));
124 }
125 Ok(())
126 }
127}
128
129#[derive(Debug, Default)]
145pub struct MetaTagsBuilder {
146 title: Option<String>,
148 description: Option<String>,
150 og_type: String,
152 additional_tags: Vec<(String, String)>,
154}
155
156impl MetaTagsBuilder {
157 #[must_use]
167 pub fn new() -> Self {
168 Self {
169 title: None,
170 description: None,
171 og_type: String::from(DEFAULT_OG_TYPE),
172 additional_tags: Vec::new(),
173 }
174 }
175
176 #[must_use]
191 pub fn with_title(mut self, title: impl Into<String>) -> Self {
192 self.title = Some(title.into());
193 self
194 }
195
196 #[must_use]
211 pub fn with_description(mut self, desc: impl Into<String>) -> Self {
212 self.description = Some(desc.into());
213 self
214 }
215
216 #[must_use]
232 pub fn add_meta_tag(
233 mut self,
234 name: impl Into<String>,
235 content: impl Into<String>,
236 ) -> Self {
237 self.additional_tags.push((name.into(), content.into()));
238 self
239 }
240
241 #[must_use]
261 pub fn add_meta_tags<I>(mut self, tags: I) -> Self
262 where
263 I: IntoIterator<Item = (String, String)>,
264 {
265 self.additional_tags.extend(tags);
266 self
267 }
268
269 pub fn build(self) -> Result<String> {
288 let title = self.title.ok_or_else(|| {
289 HtmlError::seo(
290 SeoErrorKind::MissingTitle,
291 "Meta title is required",
292 None,
293 )
294 })?;
295
296 let description = self.description.ok_or_else(|| {
297 HtmlError::seo(
298 SeoErrorKind::MissingDescription,
299 "Meta description is required",
300 None,
301 )
302 })?;
303
304 let mut meta_tags = String::with_capacity(500);
305
306 meta_tags.push_str(&format!(
308 r#"<meta name="title" content="{}">"#,
309 escape_html(&title)
310 ));
311 meta_tags.push_str(&format!(
312 r#"<meta name="description" content="{}">"#,
313 escape_html(&description)
314 ));
315 meta_tags.push_str(&format!(
316 r#"<meta property="og:type" content="{}">"#,
317 escape_html(&self.og_type)
318 ));
319
320 for (name, content) in self.additional_tags {
322 meta_tags.push_str(&format!(
323 r#"<meta name="{}" content="{}">"#,
324 escape_html(&name),
325 escape_html(&content)
326 ));
327 }
328
329 Ok(meta_tags)
330 }
331}
332
333fn validate_page_type(page_type: &str) -> Result<()> {
339 if page_type.is_empty() {
340 return Err(HtmlError::seo(
341 SeoErrorKind::InvalidStructuredData,
342 "Page type cannot be empty",
343 None,
344 ));
345 }
346 Ok(())
347}
348
349#[must_use]
380pub fn escape_html(s: &str) -> Cow<'_, str> {
381 if !s
387 .bytes()
388 .any(|b| matches!(b, b'&' | b'<' | b'>' | b'"' | b'\''))
389 {
390 return Cow::Borrowed(s);
391 }
392 HTML_ESCAPES.replace_all(s, |caps: &Captures| match &caps[0] {
396 "&" => "&",
397 "<" => "<",
398 ">" => ">",
399 "\"" => """,
400 _ => "'",
401 })
402}
403
404pub fn generate_meta_tags(html: &str) -> Result<String> {
430 if html.len() > MAX_HTML_SIZE {
431 return Err(HtmlError::InputTooLarge(html.len()));
432 }
433
434 let document = Html::parse_document(html);
435 let title = extract_title(&document)?;
436 let description = extract_description(&document)?;
437
438 MetaTagsBuilder::new()
439 .with_title(title)
440 .with_description(description)
441 .build()
442}
443
444pub fn generate_structured_data(
473 html: &str,
474 config: Option<StructuredDataConfig>,
475) -> Result<String> {
476 if html.len() > MAX_HTML_SIZE {
477 return Err(HtmlError::InputTooLarge(html.len()));
478 }
479
480 let document = Html::parse_document(html);
481 generate_structured_data_from_doc(&document, config)
482}
483
484pub fn generate_structured_data_from_doc(
507 document: &Html,
508 config: Option<StructuredDataConfig>,
509) -> Result<String> {
510 let config = config.unwrap_or_default();
511 config.validate()?;
512
513 let title = extract_title(document)?;
514 let description = extract_description(document)?;
515
516 let mut json = if config.additional_types.is_empty() {
517 json!({
518 "@context": SCHEMA_ORG_CONTEXT,
519 "@type": config.page_type,
520 "name": title,
521 "description": description,
522 })
523 } else {
524 let mut types = vec![config.page_type];
525 types.extend(config.additional_types);
526 json!({
527 "@context": SCHEMA_ORG_CONTEXT,
528 "@type": types,
529 "name": title,
530 "description": description,
531 })
532 };
533
534 if let Some(additional_data) = config.additional_data {
535 for (key, value) in additional_data {
536 json[key] = json!(value);
537 }
538 }
539
540 Ok(format!(
543 r#"<script type="application/ld+json">{}</script>"#,
544 serde_json::to_string(&json).map_err(|e| {
545 HtmlError::InvalidStructuredData(e.to_string())
546 })?
547 ))
548}
549
550fn extract_title(document: &Html) -> Result<String> {
552 document
553 .select(&TITLE_SELECTOR)
554 .next()
555 .map(|t| t.text().collect::<String>())
556 .ok_or_else(|| {
557 HtmlError::MissingHtmlElement("title".to_string())
558 })
559}
560
561fn extract_description(document: &Html) -> Result<String> {
562 if let Some(meta) = document.select(&META_DESC_SELECTOR).next() {
564 if let Some(content) = meta.value().attr("content") {
565 return Ok(content.to_string());
566 }
567 }
568
569 document
571 .select(&PARAGRAPH_SELECTOR)
572 .next()
573 .map(|p| p.text().collect::<String>())
574 .ok_or_else(|| {
575 HtmlError::MissingHtmlElement("description".to_string())
576 })
577}
578
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case as case;

    /// Tests for `MetaTagsBuilder`.
    mod meta_tags_builder {
        use super::*;

        #[test]
        fn handles_duplicate_meta_tags() {
            // Tags sharing a name are all kept; the builder does not de-duplicate.
            let meta_tags = MetaTagsBuilder::new()
                .with_title("Duplicate Test")
                .with_description("Testing duplicates")
                .add_meta_tag("author", "John Doe")
                .add_meta_tag("author", "Jane Doe")
                .build()
                .unwrap();

            assert!(meta_tags.contains(r#"content="John Doe""#));
            assert!(meta_tags.contains(r#"content="Jane Doe""#));
        }

        #[test]
        fn handles_multiple_add_meta_tags_calls() {
            let mut builder = MetaTagsBuilder::new()
                .with_title("Test")
                .with_description("Description");
            builder = builder.add_meta_tags(vec![(
                "key1".to_string(),
                "value1".to_string(),
            )]);
            builder = builder.add_meta_tags(vec![(
                "key2".to_string(),
                "value2".to_string(),
            )]);
            let meta_tags = builder.build().unwrap();

            assert!(meta_tags.contains(r#"content="value1""#));
            assert!(meta_tags.contains(r#"content="value2""#));
        }

        #[test]
        fn builds_basic_meta_tags() {
            let meta_tags = MetaTagsBuilder::new()
                .with_title("Test Title")
                .with_description("Test Description")
                .add_meta_tag("keywords", "test,keywords")
                .build()
                .unwrap();

            assert!(meta_tags.contains(
                r#"<meta name="title" content="Test Title">"#
            ));
            assert!(meta_tags.contains(
                r#"<meta name="description" content="Test Description">"#
            ));
            assert!(meta_tags.contains(
                r#"<meta name="keywords" content="test,keywords">"#
            ));
        }

        #[test]
        fn handles_multiple_meta_tags() {
            let tags = vec![
                ("keywords".to_string(), "test,tags".to_string()),
                ("robots".to_string(), "index,follow".to_string()),
            ];
            let meta_tags = MetaTagsBuilder::new()
                .with_title("Test")
                .with_description("Test")
                .add_meta_tags(tags)
                .build()
                .unwrap();

            assert!(
                meta_tags.contains(r#"keywords" content="test,tags"#)
            );
            assert!(
                meta_tags.contains(r#"robots" content="index,follow"#)
            );
        }

        #[test]
        fn fails_without_title() {
            let result = MetaTagsBuilder::new()
                .with_description("Test Description")
                .build();

            assert!(matches!(
                result,
                Err(HtmlError::Seo {
                    kind: SeoErrorKind::MissingTitle,
                    ..
                })
            ));
        }

        #[test]
        fn fails_without_description() {
            let result =
                MetaTagsBuilder::new().with_title("Test Title").build();

            assert!(matches!(
                result,
                Err(HtmlError::Seo {
                    kind: SeoErrorKind::MissingDescription,
                    ..
                })
            ));
        }

        #[test]
        fn escapes_special_characters_in_meta_tags() {
            let meta_tags = MetaTagsBuilder::new()
                .with_title("Test & Title")
                .with_description("Test < Description >")
                .build()
                .unwrap();

            // The rendered attribute values must carry entities, not the raw
            // characters supplied above.
            assert!(meta_tags.contains(r#"content="Test &amp; Title"#));
            assert!(meta_tags
                .contains(r#"content="Test &lt; Description &gt;"#));
        }
    }

    /// Tests for `escape_html`.
    mod html_escaping {
        use super::*;

        #[case("<>&\"'" => "&lt;&gt;&amp;&quot;&#x27;" ; "escapes all special characters")]
        #[case("Normal text" => "Normal text" ; "leaves normal text unchanged")]
        #[case("" => "" ; "handles empty string")]
        fn escape_html_cases(input: &str) -> String {
            escape_html(input).into_owned()
        }

        #[test]
        fn escapes_mixed_content() {
            let input = "Text with <tags> & \"quotes\" 'here'";
            let expected = "Text with &lt;tags&gt; &amp; &quot;quotes&quot; &#x27;here&#x27;";
            assert_eq!(escape_html(input), expected);
        }

        #[test]
        fn handles_large_input() {
            let large_input = "<>".repeat(100_000);
            let escaped = escape_html(&large_input);
            assert!(escaped.contains("&lt;&gt;"));
        }
    }

    /// Tests for structured-data generation.
    mod structured_data {
        use super::*;

        #[test]
        fn handles_deeply_nested_configuration() {
            let html = r"<html><head><title>Nested Test</title></head><body><p>Description</p></body></html>";
            let mut additional_data = HashMap::new();
            _ = additional_data
                .insert("level1".to_string(), "value1".to_string());
            _ = additional_data
                .insert("level2".to_string(), "value2".to_string());

            let config = StructuredDataConfig {
                page_type: "TestType".to_string(),
                additional_types: vec!["ExtraType".to_string()],
                additional_data: Some(additional_data),
            };

            let result =
                generate_structured_data(html, Some(config)).unwrap();
            let json_content = extract_json_from_script(&result);
            let parsed: serde_json::Value =
                serde_json::from_str(&json_content).unwrap();

            assert_eq!(
                parsed["@type"],
                serde_json::json!(["TestType", "ExtraType"])
            );
            assert_eq!(parsed["level1"], "value1");
            assert_eq!(parsed["level2"], "value2");
        }

        #[test]
        fn generates_basic_structured_data() {
            let html = r"<html><head><title>Test</title></head><body><p>Description</p></body></html>";
            let result = generate_structured_data(html, None).unwrap();

            let json_content = extract_json_from_script(&result);
            let parsed: serde_json::Value =
                serde_json::from_str(&json_content).unwrap();

            assert_eq!(parsed["@type"], "WebPage");
            assert_eq!(parsed["name"], "Test");
            assert_eq!(parsed["description"], "Description");
        }

        #[test]
        fn generates_multiple_types() {
            let html = r"<html><head><title>Test</title></head><body><p>Description</p></body></html>";
            let config = StructuredDataConfig {
                page_type: "Article".to_string(),
                additional_types: vec!["WebPage".to_string()],
                additional_data: Some(HashMap::from([(
                    "author".to_string(),
                    "Test Author".to_string(),
                )])),
            };

            let result =
                generate_structured_data(html, Some(config)).unwrap();
            let json_content = extract_json_from_script(&result);
            let parsed: serde_json::Value =
                serde_json::from_str(&json_content).unwrap();

            assert_eq!(
                parsed["@type"],
                serde_json::json!(["Article", "WebPage"]),
                "Expected @type to include multiple types"
            );
            assert_eq!(
                parsed["author"], "Test Author",
                "Expected author to be included"
            );
        }

        #[test]
        fn validates_config() {
            let empty_type = StructuredDataConfig {
                page_type: "".to_string(),
                ..Default::default()
            };
            assert!(empty_type.validate().is_err());

            let empty_additional = StructuredDataConfig {
                additional_types: vec!["".to_string()],
                ..Default::default()
            };
            assert!(empty_additional.validate().is_err());
        }

        /// Returns the text from the first `{` to the last `}` of `script`,
        /// i.e. the JSON payload without the surrounding `<script>` tags.
        fn extract_json_from_script(script: &str) -> String {
            let json_start =
                script.find('{').expect("JSON should start with '{'");
            let json_end =
                script.rfind('}').expect("JSON should end with '}'");
            script[json_start..=json_end].to_string()
        }
    }

    /// Tests for input size limits and missing elements.
    mod input_validation {
        use super::*;

        #[test]
        fn enforces_size_limit_for_meta_tags() {
            let large_html = "a".repeat(MAX_HTML_SIZE + 1);
            assert!(matches!(
                generate_meta_tags(&large_html),
                Err(HtmlError::InputTooLarge(_))
            ));
        }

        #[test]
        fn enforces_size_limit_for_structured_data() {
            let large_html = "a".repeat(MAX_HTML_SIZE + 1);
            assert!(matches!(
                generate_structured_data(&large_html, None),
                Err(HtmlError::InputTooLarge(_))
            ));
        }

        #[test]
        fn handles_missing_title() {
            let html =
                r"<html><body><p>No title here</p></body></html>";
            assert!(matches!(
                generate_meta_tags(html),
                Err(HtmlError::MissingHtmlElement(ref e)) if e == "title"
            ));
        }

        #[test]
        fn handles_missing_description() {
            let html =
                r"<html><head><title>Title only</title></head></html>";
            assert!(matches!(
                generate_meta_tags(html),
                Err(HtmlError::MissingHtmlElement(ref e)) if e == "description"
            ));
        }

        #[test]
        fn invalid_additional_data_keys() {
            // NOTE(review): nothing in the generator validates additional-data
            // keys; the error asserted here presumably comes from the input
            // document having no <title> — confirm whether key validation was
            // intended.
            let mut additional_data = HashMap::new();
            _ = additional_data
                .insert("<invalid>".to_string(), "value".to_string());
            let config = StructuredDataConfig {
                additional_data: Some(additional_data),
                ..Default::default()
            };
            let result =
                generate_structured_data("<html></html>", Some(config));
            assert!(result.is_err());
        }
    }
}