Skip to main content

html_generator/
seo.rs

1// Copyright © 2025 HTML Generator. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! Search Engine Optimization (SEO) functionality for HTML processing.
5//!
6//! This module provides tools for improving the SEO of web pages through automated
7//! meta tag generation and structured data implementation. It includes features for:
8//! - Meta tag generation for improved search engine visibility
9//! - Structured data (JSON-LD) generation for rich search results
10//! - HTML content analysis for SEO optimization
11//! - Safe HTML entity escaping
12//!
13//! # Examples
14//!
15//! ```rust
16//! use html_generator::seo::{MetaTagsBuilder, generate_structured_data};
17//!
18//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
19//! let html = r#"<html><head><title>My Page</title></head><body><p>Content</p></body></html>"#;
20//!
21//! // Generate meta tags
22//! let meta_tags = MetaTagsBuilder::new()
23//!     .with_title("My Page")
24//!     .with_description("Page content")
25//!     .build()?;
26//!
27//! // Generate structured data
28//! let structured_data = generate_structured_data(html, None)?;
29//! # Ok(())
30//! # }
31//! ```
32
33use serde_json::json;
34use std::borrow::Cow;
35use std::collections::HashMap;
36
37use crate::error::{HtmlError, Result, SeoErrorKind};
38use once_cell::sync::Lazy;
39use regex::{Captures, Regex};
40use scraper::{Html, Selector};
41
42// Constants
43/// Maximum allowed size for HTML input (1MB)
44const MAX_HTML_SIZE: usize = 1_000_000;
45/// Default page type for structured data
46const DEFAULT_PAGE_TYPE: &str = "WebPage";
47/// Schema.org context URL
48const SCHEMA_ORG_CONTEXT: &str = "https://schema.org";
49/// Default OpenGraph type
50const DEFAULT_OG_TYPE: &str = "website";
51
52/// Regex for matching HTML special characters
53static HTML_ESCAPES: Lazy<Regex> = Lazy::new(|| {
54    Regex::new(r#"[&<>"']"#).expect("static HTML_ESCAPES must compile")
55});
56
57/// Selector for extracting meta description
58static META_DESC_SELECTOR: Lazy<Selector> = Lazy::new(|| {
59    Selector::parse("meta[name='description']")
60        .expect("static META_DESC_SELECTOR must parse")
61});
62
63/// Selector for extracting title
64static TITLE_SELECTOR: Lazy<Selector> = Lazy::new(|| {
65    Selector::parse("title").expect("static TITLE_SELECTOR must parse")
66});
67
68/// Selector for extracting paragraphs
69static PARAGRAPH_SELECTOR: Lazy<Selector> = Lazy::new(|| {
70    Selector::parse("p").expect("static PARAGRAPH_SELECTOR must parse")
71});
72
73/// Configuration options for structured data generation.
74///
75/// # Examples
76///
77/// ```
78/// use html_generator::seo::StructuredDataConfig;
79///
80/// let cfg = StructuredDataConfig {
81///     page_type: "Article".into(),
82///     additional_types: vec!["NewsArticle".into()],
83///     additional_data: None,
84/// };
85/// assert_eq!(cfg.page_type, "Article");
86/// ```
87#[derive(Debug, Clone)]
88pub struct StructuredDataConfig {
89    /// Additional key-value pairs to include in the structured data
90    pub additional_data: Option<HashMap<String, String>>,
91    /// The type of webpage (e.g., "WebPage", "Article", "Product")
92    pub page_type: String,
93    /// Additional schema.org types to include
94    pub additional_types: Vec<String>,
95}
96
97impl Default for StructuredDataConfig {
98    fn default() -> Self {
99        Self {
100            additional_data: None,
101            page_type: String::from(DEFAULT_PAGE_TYPE),
102            additional_types: Vec::new(),
103        }
104    }
105}
106
107impl StructuredDataConfig {
108    /// Validates the configuration.
109    ///
110    /// # Errors
111    ///
112    /// Returns an error if:
113    /// - The page type is empty
114    /// - Any additional type is empty
115    fn validate(&self) -> Result<()> {
116        validate_page_type(&self.page_type)?;
117
118        if self.additional_types.iter().any(String::is_empty) {
119            return Err(HtmlError::seo(
120                SeoErrorKind::InvalidStructuredData,
121                "Additional types cannot be empty",
122                None,
123            ));
124        }
125        Ok(())
126    }
127}
128
129/// Builder for constructing meta tags.
130///
131/// # Examples
132///
133/// ```
134/// use html_generator::seo::MetaTagsBuilder;
135///
136/// let tags = MetaTagsBuilder::new()
137///     .with_title("My Page")
138///     .with_description("An example page")
139///     .build()
140///     .unwrap();
141/// assert!(tags.contains(r#"name="title""#));
142/// assert!(tags.contains(r#"name="description""#));
143/// ```
144#[derive(Debug, Default)]
145pub struct MetaTagsBuilder {
146    /// Title for the meta tags
147    title: Option<String>,
148    /// Description for the meta tags
149    description: Option<String>,
150    /// OpenGraph type
151    og_type: String,
152    /// Additional meta tags
153    additional_tags: Vec<(String, String)>,
154}
155
156impl MetaTagsBuilder {
157    /// Creates a new `MetaTagsBuilder` with default values.
158    ///
159    /// # Examples
160    ///
161    /// ```
162    /// use html_generator::seo::MetaTagsBuilder;
163    ///
164    /// let _ = MetaTagsBuilder::new();
165    /// ```
166    #[must_use]
167    pub fn new() -> Self {
168        Self {
169            title: None,
170            description: None,
171            og_type: String::from(DEFAULT_OG_TYPE),
172            additional_tags: Vec::new(),
173        }
174    }
175
176    /// Sets the title for the meta tags.
177    ///
178    /// # Examples
179    ///
180    /// ```
181    /// use html_generator::seo::MetaTagsBuilder;
182    ///
183    /// let tags = MetaTagsBuilder::new()
184    ///     .with_title("Home")
185    ///     .with_description("welcome")
186    ///     .build()
187    ///     .unwrap();
188    /// assert!(tags.contains(r#"content="Home""#));
189    /// ```
190    #[must_use]
191    pub fn with_title(mut self, title: impl Into<String>) -> Self {
192        self.title = Some(title.into());
193        self
194    }
195
196    /// Sets the description for the meta tags.
197    ///
198    /// # Examples
199    ///
200    /// ```
201    /// use html_generator::seo::MetaTagsBuilder;
202    ///
203    /// let tags = MetaTagsBuilder::new()
204    ///     .with_title("t")
205    ///     .with_description("an example description")
206    ///     .build()
207    ///     .unwrap();
208    /// assert!(tags.contains("an example description"));
209    /// ```
210    #[must_use]
211    pub fn with_description(mut self, desc: impl Into<String>) -> Self {
212        self.description = Some(desc.into());
213        self
214    }
215
216    /// Adds an additional meta tag.
217    ///
218    /// # Examples
219    ///
220    /// ```
221    /// use html_generator::seo::MetaTagsBuilder;
222    ///
223    /// let tags = MetaTagsBuilder::new()
224    ///     .with_title("t")
225    ///     .with_description("d")
226    ///     .add_meta_tag("author", "Jane Doe")
227    ///     .build()
228    ///     .unwrap();
229    /// assert!(tags.contains(r#"name="author" content="Jane Doe""#));
230    /// ```
231    #[must_use]
232    pub fn add_meta_tag(
233        mut self,
234        name: impl Into<String>,
235        content: impl Into<String>,
236    ) -> Self {
237        self.additional_tags.push((name.into(), content.into()));
238        self
239    }
240
241    /// Adds multiple meta tags at once.
242    ///
243    /// # Examples
244    ///
245    /// ```
246    /// use html_generator::seo::MetaTagsBuilder;
247    ///
248    /// let extra = vec![
249    ///     ("author".to_string(), "Jane".to_string()),
250    ///     ("robots".to_string(), "index,follow".to_string()),
251    /// ];
252    /// let tags = MetaTagsBuilder::new()
253    ///     .with_title("t")
254    ///     .with_description("d")
255    ///     .add_meta_tags(extra)
256    ///     .build()
257    ///     .unwrap();
258    /// assert!(tags.contains(r#"name="robots""#));
259    /// ```
260    #[must_use]
261    pub fn add_meta_tags<I>(mut self, tags: I) -> Self
262    where
263        I: IntoIterator<Item = (String, String)>,
264    {
265        self.additional_tags.extend(tags);
266        self
267    }
268
269    /// Builds the meta tags string.
270    ///
271    /// # Errors
272    ///
273    /// Returns an error if required fields (title or description) are missing.
274    ///
275    /// # Examples
276    ///
277    /// ```
278    /// use html_generator::seo::MetaTagsBuilder;
279    ///
280    /// let tags = MetaTagsBuilder::new()
281    ///     .with_title("Home")
282    ///     .with_description("welcome")
283    ///     .build()
284    ///     .unwrap();
285    /// assert!(tags.starts_with("<meta"));
286    /// ```
287    pub fn build(self) -> Result<String> {
288        let title = self.title.ok_or_else(|| {
289            HtmlError::seo(
290                SeoErrorKind::MissingTitle,
291                "Meta title is required",
292                None,
293            )
294        })?;
295
296        let description = self.description.ok_or_else(|| {
297            HtmlError::seo(
298                SeoErrorKind::MissingDescription,
299                "Meta description is required",
300                None,
301            )
302        })?;
303
304        let mut meta_tags = String::with_capacity(500);
305
306        // Add required meta tags
307        meta_tags.push_str(&format!(
308            r#"<meta name="title" content="{}">"#,
309            escape_html(&title)
310        ));
311        meta_tags.push_str(&format!(
312            r#"<meta name="description" content="{}">"#,
313            escape_html(&description)
314        ));
315        meta_tags.push_str(&format!(
316            r#"<meta property="og:type" content="{}">"#,
317            escape_html(&self.og_type)
318        ));
319
320        // Add additional meta tags
321        for (name, content) in self.additional_tags {
322            meta_tags.push_str(&format!(
323                r#"<meta name="{}" content="{}">"#,
324                escape_html(&name),
325                escape_html(&content)
326            ));
327        }
328
329        Ok(meta_tags)
330    }
331}
332
333/// Validates that a page type is not empty.
334///
335/// # Errors
336///
337/// Returns an error if the page type is empty.
338fn validate_page_type(page_type: &str) -> Result<()> {
339    if page_type.is_empty() {
340        return Err(HtmlError::seo(
341            SeoErrorKind::InvalidStructuredData,
342            "Page type cannot be empty",
343            None,
344        ));
345    }
346    Ok(())
347}
348
349/// Escapes HTML special characters in a string.
350///
351/// This function replaces special characters with their HTML entity equivalents:
352/// - `&` becomes `&amp;`
353/// - `<` becomes `&lt;`
354/// - `>` becomes `&gt;`
355/// - `"` becomes `&quot;`
356/// - `'` becomes `&#x27;`
357///
358/// # Arguments
359///
360/// * `s` - The string to escape
361///
362/// # Returns
363///
364/// Returns a `Cow<str>` containing either the original string if no escaping was
365/// needed, or a new string with escaped characters.
366///
367/// # Examples
368///
369/// ```
370/// use html_generator::seo::escape_html;
371///
372/// let input = r#"<script>alert("Hello & goodbye")</script>"#;
373/// let escaped = escape_html(input);
374/// assert_eq!(
375///     escaped,
376///     r#"&lt;script&gt;alert(&quot;Hello &amp; goodbye&quot;)&lt;/script&gt;"#
377/// );
378/// ```
379#[must_use]
380pub fn escape_html(s: &str) -> Cow<'_, str> {
381    // Fast path: most attribute/text values contain none of the
382    // reserved bytes. `bytes().any` lowers to a SIMD-accelerated scan
383    // on Apple Silicon (NEON) and x86-64 (AVX2) via the stdlib's
384    // `memchr`, which is an order of magnitude faster than entering
385    // the regex engine to discover there is nothing to replace.
386    if !s
387        .bytes()
388        .any(|b| matches!(b, b'&' | b'<' | b'>' | b'"' | b'\''))
389    {
390        return Cow::Borrowed(s);
391    }
392    // The regex matches exactly the 5 characters below; the wildcard
393    // arm is saturated by the single-quote case to avoid an
394    // unreachable/dead branch in coverage reports.
395    HTML_ESCAPES.replace_all(s, |caps: &Captures| match &caps[0] {
396        "&" => "&amp;",
397        "<" => "&lt;",
398        ">" => "&gt;",
399        "\"" => "&quot;",
400        _ => "&#x27;",
401    })
402}
403
404/// Generates meta tags for SEO purposes.
405///
406/// # Arguments
407///
408/// * `html` - The HTML content to analyze
409///
410/// # Returns
411///
412/// Returns a `Result` containing the generated meta tags as a string.
413///
414/// # Errors
415///
416/// Returns an error if:
417/// - The HTML input is too large (> 1MB)
418/// - Required elements (title, description) are missing
419///
420/// # Examples
421///
422/// ```
423/// use html_generator::seo::generate_meta_tags;
424///
425/// let html = r#"<html><head><title>Test</title></head><body><p>Content</p></body></html>"#;
426/// let meta_tags = generate_meta_tags(html)?;
427/// # Ok::<(), html_generator::error::HtmlError>(())
428/// ```
429pub fn generate_meta_tags(html: &str) -> Result<String> {
430    if html.len() > MAX_HTML_SIZE {
431        return Err(HtmlError::InputTooLarge(html.len()));
432    }
433
434    let document = Html::parse_document(html);
435    let title = extract_title(&document)?;
436    let description = extract_description(&document)?;
437
438    MetaTagsBuilder::new()
439        .with_title(title)
440        .with_description(description)
441        .build()
442}
443
444/// Generates structured data (JSON-LD) for SEO purposes.
445///
446/// # Arguments
447///
448/// * `html` - The HTML content to analyze
449/// * `config` - Optional configuration for structured data generation
450///
451/// # Returns
452///
453/// Returns a `Result` containing the generated JSON-LD script as a string.
454///
455/// # Errors
456///
457/// Returns an error if:
458/// - The HTML input is too large (> 1MB)
459/// - Required elements are missing
460/// - JSON serialization fails
461/// - Configuration validation fails
462///
463/// # Examples
464///
465/// ```
466/// use html_generator::seo::generate_structured_data;
467///
468/// let html = r#"<html><head><title>Test</title></head><body><p>Content</p></body></html>"#;
469/// let structured_data = generate_structured_data(html, None)?;
470/// # Ok::<(), html_generator::error::HtmlError>(())
471/// ```
472pub fn generate_structured_data(
473    html: &str,
474    config: Option<StructuredDataConfig>,
475) -> Result<String> {
476    if html.len() > MAX_HTML_SIZE {
477        return Err(HtmlError::InputTooLarge(html.len()));
478    }
479
480    let document = Html::parse_document(html);
481    generate_structured_data_from_doc(&document, config)
482}
483
484/// Generates structured data from a pre-parsed DOM tree.
485///
486/// This avoids redundant parsing when the pipeline has already parsed
487/// the HTML for other steps.
488///
489/// # Examples
490///
491/// ```
492/// use html_generator::seo::generate_structured_data_from_doc;
493/// use scraper::Html;
494///
495/// let doc = Html::parse_document(
496///     "<html><head><title>Hi</title></head><body><p>Body</p></body></html>",
497/// );
498/// let json_ld = generate_structured_data_from_doc(&doc, None).unwrap();
499/// assert!(json_ld.contains("application/ld+json"));
500/// ```
501///
502/// # Errors
503///
504/// Same as [`generate_structured_data`]: missing title/description,
505/// failed config validation, or JSON serialization failure.
506pub fn generate_structured_data_from_doc(
507    document: &Html,
508    config: Option<StructuredDataConfig>,
509) -> Result<String> {
510    let config = config.unwrap_or_default();
511    config.validate()?;
512
513    let title = extract_title(document)?;
514    let description = extract_description(document)?;
515
516    let mut json = if config.additional_types.is_empty() {
517        json!({
518            "@context": SCHEMA_ORG_CONTEXT,
519            "@type": config.page_type,
520            "name": title,
521            "description": description,
522        })
523    } else {
524        let mut types = vec![config.page_type];
525        types.extend(config.additional_types);
526        json!({
527            "@context": SCHEMA_ORG_CONTEXT,
528            "@type": types,
529            "name": title,
530            "description": description,
531        })
532    };
533
534    if let Some(additional_data) = config.additional_data {
535        for (key, value) in additional_data {
536            json[key] = json!(value);
537        }
538    }
539
540    // Compact output — consumers are parsers, not humans. Roughly ~30%
541    // smaller and ~2× faster than `to_string_pretty`.
542    Ok(format!(
543        r#"<script type="application/ld+json">{}</script>"#,
544        serde_json::to_string(&json).map_err(|e| {
545            HtmlError::InvalidStructuredData(e.to_string())
546        })?
547    ))
548}
549
550// Private helper functions
551fn extract_title(document: &Html) -> Result<String> {
552    document
553        .select(&TITLE_SELECTOR)
554        .next()
555        .map(|t| t.text().collect::<String>())
556        .ok_or_else(|| {
557            HtmlError::MissingHtmlElement("title".to_string())
558        })
559}
560
561fn extract_description(document: &Html) -> Result<String> {
562    // Try meta description first
563    if let Some(meta) = document.select(&META_DESC_SELECTOR).next() {
564        if let Some(content) = meta.value().attr("content") {
565            return Ok(content.to_string());
566        }
567    }
568
569    // Fall back to first paragraph
570    document
571        .select(&PARAGRAPH_SELECTOR)
572        .next()
573        .map(|p| p.text().collect::<String>())
574        .ok_or_else(|| {
575            HtmlError::MissingHtmlElement("description".to_string())
576        })
577}
578
579#[cfg(test)]
580mod tests {
581    use super::*;
582    use test_case::test_case as case;
583
584    /// Tests for MetaTagsBuilder functionality
585    mod meta_tags_builder {
586        use super::*;
587
588        #[test]
589        fn handles_duplicate_meta_tags() {
590            let meta_tags = MetaTagsBuilder::new()
591                .with_title("Duplicate Test")
592                .with_description("Testing duplicates")
593                .add_meta_tag("author", "John Doe")
594                .add_meta_tag("author", "Jane Doe")
595                .build()
596                .unwrap();
597
598            assert!(meta_tags.contains(r#"content="John Doe""#));
599            assert!(meta_tags.contains(r#"content="Jane Doe""#));
600        }
601
602        #[test]
603        fn handles_multiple_add_meta_tags_calls() {
604            let mut builder = MetaTagsBuilder::new()
605                .with_title("Test")
606                .with_description("Description");
607            builder = builder.add_meta_tags(vec![(
608                "key1".to_string(),
609                "value1".to_string(),
610            )]);
611            builder = builder.add_meta_tags(vec![(
612                "key2".to_string(),
613                "value2".to_string(),
614            )]);
615            let meta_tags = builder.build().unwrap();
616
617            assert!(meta_tags.contains(r#"content="value1""#));
618            assert!(meta_tags.contains(r#"content="value2""#));
619        }
620
621        #[test]
622        fn builds_basic_meta_tags() {
623            let meta_tags = MetaTagsBuilder::new()
624                .with_title("Test Title")
625                .with_description("Test Description")
626                .add_meta_tag("keywords", "test,keywords")
627                .build()
628                .unwrap();
629
630            assert!(meta_tags.contains(
631                r#"<meta name="title" content="Test Title">"#
632            ));
633            assert!(meta_tags.contains(r#"<meta name="description" content="Test Description">"#));
634            assert!(meta_tags.contains(
635                r#"<meta name="keywords" content="test,keywords">"#
636            ));
637        }
638
639        #[test]
640        fn handles_multiple_meta_tags() {
641            let tags = vec![
642                ("keywords".to_string(), "test,tags".to_string()),
643                ("robots".to_string(), "index,follow".to_string()),
644            ];
645            let meta_tags = MetaTagsBuilder::new()
646                .with_title("Test")
647                .with_description("Test")
648                .add_meta_tags(tags)
649                .build()
650                .unwrap();
651
652            assert!(
653                meta_tags.contains(r#"keywords" content="test,tags"#)
654            );
655            assert!(
656                meta_tags.contains(r#"robots" content="index,follow"#)
657            );
658        }
659
660        #[test]
661        fn fails_without_title() {
662            let result = MetaTagsBuilder::new()
663                .with_description("Test Description")
664                .build();
665
666            assert!(matches!(
667                result,
668                Err(HtmlError::Seo {
669                    kind: SeoErrorKind::MissingTitle,
670                    ..
671                })
672            ));
673        }
674
675        #[test]
676        fn fails_without_description() {
677            let result =
678                MetaTagsBuilder::new().with_title("Test Title").build();
679
680            assert!(matches!(
681                result,
682                Err(HtmlError::Seo {
683                    kind: SeoErrorKind::MissingDescription,
684                    ..
685                })
686            ));
687        }
688
689        #[test]
690        fn escapes_special_characters_in_meta_tags() {
691            let meta_tags = MetaTagsBuilder::new()
692                .with_title("Test & Title")
693                .with_description("Test < Description >")
694                .build()
695                .unwrap();
696
697            assert!(meta_tags.contains(r#"content="Test &amp; Title"#));
698            assert!(meta_tags
699                .contains(r#"content="Test &lt; Description &gt;"#));
700        }
701    }
702
703    /// Tests for HTML escaping functionality
704    mod html_escaping {
705        use super::*;
706
707        #[case("<>&\"'" => "&lt;&gt;&amp;&quot;&#x27;" ; "escapes all special characters")]
708        #[case("Normal text" => "Normal text" ; "leaves normal text unchanged")]
709        #[case("" => "" ; "handles empty string")]
710        fn escape_html_cases(input: &str) -> String {
711            escape_html(input).into_owned()
712        }
713
714        #[test]
715        fn escapes_mixed_content() {
716            let input = "Text with <tags> & \"quotes\" 'here'";
717            let expected = "Text with &lt;tags&gt; &amp; &quot;quotes&quot; &#x27;here&#x27;";
718            assert_eq!(escape_html(input), expected);
719        }
720
721        #[test]
722        fn handles_large_input() {
723            let large_input = "<>".repeat(100_000);
724            let escaped = escape_html(&large_input);
725            assert!(escaped.contains("&lt;&gt;"));
726        }
727    }
728
729    /// Tests for structured data functionality
730    mod structured_data {
731        use super::*;
732
733        #[test]
734        fn handles_deeply_nested_configuration() {
735            let html = r"<html><head><title>Nested Test</title></head><body><p>Description</p></body></html>";
736            let mut additional_data = HashMap::new();
737            _ = additional_data
738                .insert("level1".to_string(), "value1".to_string());
739            _ = additional_data
740                .insert("level2".to_string(), "value2".to_string());
741
742            let config = StructuredDataConfig {
743                page_type: "TestType".to_string(),
744                additional_types: vec!["ExtraType".to_string()],
745                additional_data: Some(additional_data),
746            };
747
748            let result =
749                generate_structured_data(html, Some(config)).unwrap();
750            let json_content = extract_json_from_script(&result);
751            let parsed: serde_json::Value =
752                serde_json::from_str(&json_content).unwrap();
753
754            assert_eq!(
755                parsed["@type"],
756                serde_json::json!(["TestType", "ExtraType"])
757            );
758            assert_eq!(parsed["level1"], "value1");
759            assert_eq!(parsed["level2"], "value2");
760        }
761
762        #[test]
763        fn generates_basic_structured_data() {
764            let html = r"<html><head><title>Test</title></head><body><p>Description</p></body></html>";
765            let result = generate_structured_data(html, None).unwrap();
766
767            let json_content = extract_json_from_script(&result);
768            let parsed: serde_json::Value =
769                serde_json::from_str(&json_content).unwrap();
770
771            assert_eq!(parsed["@type"], "WebPage");
772            assert_eq!(parsed["name"], "Test");
773            assert_eq!(parsed["description"], "Description");
774        }
775
776        #[test]
777        fn generates_multiple_types() {
778            let html = r"<html><head><title>Test</title></head><body><p>Description</p></body></html>";
779            let config = StructuredDataConfig {
780                page_type: "Article".to_string(),
781                additional_types: vec!["WebPage".to_string()],
782                additional_data: Some(HashMap::from([(
783                    "author".to_string(),
784                    "Test Author".to_string(),
785                )])),
786            };
787
788            let result =
789                generate_structured_data(html, Some(config)).unwrap();
790            let json_content = extract_json_from_script(&result);
791            let parsed: serde_json::Value =
792                serde_json::from_str(&json_content).unwrap();
793
794            assert_eq!(
795                parsed["@type"],
796                serde_json::json!(["Article", "WebPage"]),
797                "Expected @type to include multiple types"
798            );
799            assert_eq!(
800                parsed["author"], "Test Author",
801                "Expected author to be included"
802            );
803        }
804
805        #[test]
806        fn validates_config() {
807            let empty_type = StructuredDataConfig {
808                page_type: "".to_string(),
809                ..Default::default()
810            };
811            assert!(empty_type.validate().is_err());
812
813            let empty_additional = StructuredDataConfig {
814                additional_types: vec!["".to_string()],
815                ..Default::default()
816            };
817            assert!(empty_additional.validate().is_err());
818        }
819
820        /// Helper function to extract JSON content from script tags
821        fn extract_json_from_script(script: &str) -> String {
822            let json_start =
823                script.find('{').expect("JSON should start with '{'");
824            let json_end =
825                script.rfind('}').expect("JSON should end with '}'");
826            script[json_start..=json_end].to_string()
827        }
828    }
829
830    /// Tests for input validation and limits
831    mod input_validation {
832        use super::*;
833
834        #[test]
835        fn enforces_size_limit_for_meta_tags() {
836            let large_html = "a".repeat(MAX_HTML_SIZE + 1);
837            assert!(matches!(
838                generate_meta_tags(&large_html),
839                Err(HtmlError::InputTooLarge(_))
840            ));
841        }
842
843        #[test]
844        fn enforces_size_limit_for_structured_data() {
845            let large_html = "a".repeat(MAX_HTML_SIZE + 1);
846            assert!(matches!(
847                generate_structured_data(&large_html, None),
848                Err(HtmlError::InputTooLarge(_))
849            ));
850        }
851
852        #[test]
853        fn handles_missing_title() {
854            let html =
855                r"<html><body><p>No title here</p></body></html>";
856            assert!(matches!(
857                generate_meta_tags(html),
858                Err(HtmlError::MissingHtmlElement(ref e)) if e == "title"
859            ));
860        }
861
862        #[test]
863        fn handles_missing_description() {
864            let html =
865                r"<html><head><title>Title only</title></head></html>";
866            assert!(matches!(
867                generate_meta_tags(html),
868                Err(HtmlError::MissingHtmlElement(ref e)) if e == "description"
869            ));
870        }
871
872        #[test]
873        fn invalid_additional_data_keys() {
874            let mut additional_data = HashMap::new();
875            _ = additional_data
876                .insert("<invalid>".to_string(), "value".to_string());
877            let config = StructuredDataConfig {
878                additional_data: Some(additional_data),
879                ..Default::default()
880            };
881            let result =
882                generate_structured_data("<html></html>", Some(config));
883            assert!(result.is_err());
884        }
885    }
886}