1use crate::error::{HtmlError, Result};
10use crate::seo::escape_html;
11use once_cell::sync::Lazy;
12use regex::Regex;
13use scraper::ElementRef;
14use serde_json::Value;
15use std::collections::HashMap;
16
17static FRONT_MATTER_REGEX: Lazy<Regex> = Lazy::new(|| {
18 Regex::new(r"(?ms)^---\s*\n(.*?)\n---\s*\n")
19 .expect("static FRONT_MATTER_REGEX must compile")
20});
21
22static TOML_FRONT_MATTER_REGEX: Lazy<Regex> = Lazy::new(|| {
23 Regex::new(r"(?ms)^\+\+\+\s*\n(.*?)\n\+\+\+\s*\n")
24 .expect("static TOML_FRONT_MATTER_REGEX must compile")
25});
26
27static HEADER_REGEX: Lazy<Regex> = Lazy::new(|| {
28 Regex::new(r"<(h[1-6])(?:\s[^>]*)?>(.+?)</h[1-6]>")
29 .expect("static HEADER_REGEX must compile")
30});
31
32const MAX_INPUT_SIZE: usize = 1_000_000; pub fn extract_front_matter(content: &str) -> Result<String> {
61 if content.is_empty() {
62 return Err(HtmlError::InvalidInput("Empty input".to_string()));
63 }
64 if content.len() > MAX_INPUT_SIZE {
65 return Err(HtmlError::InputTooLarge(content.len()));
66 }
67
68 if content.starts_with("---") {
69 if let Some(captures) = FRONT_MATTER_REGEX.captures(content) {
70 let front_matter = captures
73 .get(1)
74 .expect("front-matter regex group 1 is mandatory")
75 .as_str();
76
77 for line in front_matter.lines() {
78 let trimmed = line.trim();
79 if trimmed.is_empty() || trimmed.starts_with('#') {
81 continue;
82 }
83 if !trimmed.contains(':') {
84 return Err(HtmlError::InvalidFrontMatterFormat(
85 format!(
86 "Invalid line in front matter: {}",
87 line
88 ),
89 ));
90 }
91 }
92
93 let remaining_content =
94 &content[captures.get(0).map_or(0, |m| m.end())..];
95 Ok(remaining_content.trim().to_string())
96 } else {
97 Err(HtmlError::InvalidFrontMatterFormat(
98 "Invalid front matter format".to_string(),
99 ))
100 }
101 } else {
102 Ok(content.to_string())
103 }
104}
105
106pub fn extract_front_matter_data(
138 content: &str,
139) -> Result<(Value, String)> {
140 if content.is_empty() {
141 return Err(HtmlError::InvalidInput("Empty input".to_string()));
142 }
143 if content.len() > MAX_INPUT_SIZE {
144 return Err(HtmlError::InputTooLarge(content.len()));
145 }
146
147 if content.starts_with("---") {
149 if let Some(captures) = FRONT_MATTER_REGEX.captures(content) {
150 let raw = captures
151 .get(1)
152 .expect("front-matter regex group 1 is mandatory")
153 .as_str();
154
155 let map = parse_yaml_to_map(raw)?;
156 let remaining =
157 &content[captures.get(0).map_or(0, |m| m.end())..];
158 return Ok((
159 Value::Object(map),
160 remaining.trim().to_string(),
161 ));
162 }
163 return Err(HtmlError::InvalidFrontMatterFormat(
164 "Invalid YAML front matter format".to_string(),
165 ));
166 }
167
168 if content.starts_with("+++") {
170 if let Some(captures) =
171 TOML_FRONT_MATTER_REGEX.captures(content)
172 {
173 let raw = captures
174 .get(1)
175 .expect("TOML front-matter regex group 1 is mandatory")
176 .as_str();
177
178 let map = parse_toml_to_map(raw)?;
179 let remaining =
180 &content[captures.get(0).map_or(0, |m| m.end())..];
181 return Ok((
182 Value::Object(map),
183 remaining.trim().to_string(),
184 ));
185 }
186 return Err(HtmlError::InvalidFrontMatterFormat(
187 "Invalid TOML front matter format".to_string(),
188 ));
189 }
190
191 if content.starts_with('{') {
193 if let Some(end) = find_matching_brace(content) {
194 let json_str = &content[..=end];
195 let value: Value =
196 serde_json::from_str(json_str).map_err(|e| {
197 HtmlError::InvalidFrontMatterFormat(format!(
198 "Invalid JSON front matter: {e}"
199 ))
200 })?;
201 let remaining = content[end + 1..].trim_start();
202 return Ok((value, remaining.to_string()));
203 }
204 return Err(HtmlError::InvalidFrontMatterFormat(
205 "Unmatched opening brace in JSON front matter".to_string(),
206 ));
207 }
208
209 Ok((Value::Null, content.to_string()))
211}
212
213fn parse_yaml_to_map(
218 raw: &str,
219) -> Result<serde_json::Map<String, Value>> {
220 let value: Value = crate::yaml::from_str(raw).map_err(|e| {
221 HtmlError::InvalidFrontMatterFormat(format!(
222 "Invalid YAML front matter: {e}"
223 ))
224 })?;
225 match value {
226 Value::Object(map) => Ok(map),
227 _ => Err(HtmlError::InvalidFrontMatterFormat(
228 "YAML front matter must be a mapping".to_string(),
229 )),
230 }
231}
232
233fn parse_toml_to_map(
235 raw: &str,
236) -> Result<serde_json::Map<String, Value>> {
237 let toml_value: toml::Value = toml::from_str(raw).map_err(|e| {
238 HtmlError::InvalidFrontMatterFormat(format!(
239 "Invalid TOML front matter: {e}"
240 ))
241 })?;
242 let json_value: Value =
244 serde_json::to_value(toml_value).map_err(|e| {
245 HtmlError::InvalidFrontMatterFormat(format!(
246 "Failed to convert TOML to JSON: {e}"
247 ))
248 })?;
249 match json_value {
253 Value::Object(map) => Ok(map),
254 _ => Err(HtmlError::InvalidFrontMatterFormat(
255 "TOML document must parse as a table".to_string(),
256 )),
257 }
258}
259
/// Finds the byte offset of the `}` that balances the first `{` in `content`.
///
/// Braces inside double-quoted strings are ignored, and a backslash inside
/// a string escapes the character that follows it (so `\"` does not end the
/// string).
///
/// Returns `None` when the braces never balance, and also when a `}` is
/// encountered before any `{` has been opened. The latter case previously
/// decremented an unsigned depth of zero, which panics in debug builds and
/// wraps in release builds.
fn find_matching_brace(content: &str) -> Option<usize> {
    let mut depth: usize = 0;
    let mut in_string = false;
    let mut escaped = false;

    for (i, ch) in content.char_indices() {
        if in_string {
            if escaped {
                // Whatever follows a backslash is taken literally.
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == '"' {
                in_string = false;
            }
            continue;
        }

        match ch {
            '"' => in_string = true,
            '{' => depth += 1,
            '}' => {
                // A closing brace with nothing open cannot be matched.
                depth = depth.checked_sub(1)?;
                if depth == 0 {
                    return Some(i);
                }
            }
            _ => {}
        }
    }
    None
}
293
294pub fn format_header_with_id_class(
316 header: &str,
317 id_generator: Option<fn(&str) -> String>,
318 class_generator: Option<fn(&str) -> String>,
319) -> Result<String> {
320 let captures = HEADER_REGEX.captures(header).ok_or_else(|| {
321 HtmlError::InvalidHeaderFormat(
322 "Invalid header format".to_string(),
323 )
324 })?;
325
326 let tag = captures
329 .get(1)
330 .expect("header regex group 1 is mandatory")
331 .as_str();
332
333 let text_content = captures
334 .get(2)
335 .expect("header regex group 2 is mandatory")
336 .as_str();
337
338 let id = id_generator.map_or_else(
339 || generate_id(text_content),
340 |generator| generator(text_content),
341 );
342 let class = class_generator.map_or_else(
343 || generate_id(text_content),
344 |generator| generator(text_content),
345 );
346
347 Ok(format!(
348 r#"<{} id="{}" class="{}">{}</{}>"#,
349 tag, id, class, text_content, tag
350 ))
351}
352
353pub fn generate_table_of_contents(html: &str) -> Result<String> {
373 if html.is_empty() {
374 return Err(HtmlError::InvalidInput("Empty input".to_string()));
375 }
376 if html.len() > MAX_INPUT_SIZE {
377 return Err(HtmlError::InputTooLarge(html.len()));
378 }
379
380 let mut toc = String::new();
381 toc.push_str("<ul>");
382
383 for captures in HEADER_REGEX.captures_iter(html) {
384 if let Some(tag) = captures.get(1) {
385 let content = captures.get(2).map_or("", |m| m.as_str());
386 let id = generate_id(content);
387 toc.push_str(&format!(
388 r#"<li class="toc-{}"><a href="\#{}">{}</a></li>"#,
389 tag.as_str(),
390 id,
391 escape_html(content)
392 ));
393 }
394 }
395
396 toc.push_str("</ul>");
397 Ok(toc)
398}
399
400pub fn is_valid_aria_role(role: &str, element: &ElementRef) -> bool {
424 static VALID_ROLES: Lazy<HashMap<&'static str, Vec<&'static str>>> =
425 Lazy::new(|| {
426 let mut roles = HashMap::new();
427 let _ =
428 roles.insert("a", vec!["link", "button", "menuitem"]);
429 let _ = roles.insert("button", vec!["button"]);
430 let _ =
431 roles.insert("div", vec!["alert", "tooltip", "dialog"]);
432 let _ = roles.insert(
433 "input",
434 vec!["textbox", "radio", "checkbox", "searchbox"],
435 );
436 roles
437 });
438
439 if let Some(valid_roles) = VALID_ROLES.get(element.value().name()) {
440 valid_roles.contains(&role)
441 } else {
442 false
443 }
444}
445
/// Performs a simplified structural check of a language tag such as `en`
/// or `en-US`.
///
/// The tag is split on `-` and accepted when:
///
/// * the first (primary) subtag is two or three lowercase ASCII letters;
/// * every following subtag is one to eight ASCII letters or digits.
///
/// The second rule rejects malformed tags such as `"en-"`, `"en--US"` or
/// `"en-U$"`, which were previously accepted because only the primary
/// subtag was inspected. Subtags are not checked against any registry, and
/// the primary subtag stays case-sensitive (`"EN"` is rejected).
pub fn is_valid_language_code(lang: &str) -> bool {
    let mut subtags = lang.split('-');

    // `split` always yields at least one item, but matching keeps the code
    // free of indexing.
    let primary_ok = match subtags.next() {
        Some(primary) => {
            (2..=3).contains(&primary.len())
                && primary.bytes().all(|b| b.is_ascii_lowercase())
        }
        None => false,
    };

    primary_ok
        && subtags.all(|subtag| {
            (1..=8).contains(&subtag.len())
                && subtag.bytes().all(|b| b.is_ascii_alphanumeric())
        })
}
473
/// Turns arbitrary text into a lowercase, dash-separated identifier.
///
/// Alphanumeric characters are kept (lowercased); every run of other
/// characters between two alphanumeric runs becomes a single `-`. The
/// result never starts or ends with `-`, and is empty when `content` has
/// no alphanumeric characters.
fn generate_id(content: &str) -> String {
    let mut slug = String::with_capacity(content.len());
    // Set when separator characters have been seen since the last kept
    // character; the dash is only written once another kept character
    // follows, so no trailing dash is ever produced.
    let mut separator_pending = false;

    for ch in content.chars().flat_map(char::to_lowercase) {
        if !ch.is_alphanumeric() {
            separator_pending = true;
            continue;
        }
        // `slug.is_empty()` suppresses a dash before the first kept char.
        if separator_pending && !slug.is_empty() {
            slug.push('-');
        }
        separator_pending = false;
        slug.push(ch);
    }

    slug
}
496
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::Html;

    /// Behaviour of `extract_front_matter`.
    mod extract_front_matter_tests {
        use super::*;

        #[test]
        fn test_valid_front_matter() {
            let input = "---\ntitle: My Page\n---\n# Hello, world!\n\nThis is a test.";
            let body =
                extract_front_matter(input).expect("valid front matter");
            assert_eq!(body, "# Hello, world!\n\nThis is a test.");
        }

        #[test]
        fn test_no_front_matter() {
            let input =
                "# Hello, world!\n\nThis is a test without front matter.";
            let body = extract_front_matter(input)
                .expect("valid no-front-matter input");
            assert_eq!(body, input);
        }

        #[test]
        fn test_empty_input() {
            let outcome = extract_front_matter("");
            assert!(matches!(outcome, Err(HtmlError::InvalidInput(_))));
        }

        #[test]
        fn test_exceeding_max_input_size() {
            let oversized = "a".repeat(MAX_INPUT_SIZE + 1);
            let outcome = extract_front_matter(&oversized);
            assert!(matches!(outcome, Err(HtmlError::InputTooLarge(_))));
        }

        #[test]
        fn test_invalid_front_matter_format() {
            let input = "---\ntitle: value\ninvalid_line\n---\nContent";
            let outcome = extract_front_matter(input);
            assert!(matches!(
                outcome,
                Err(HtmlError::InvalidFrontMatterFormat(_))
            ));
        }

        #[test]
        fn test_valid_front_matter_with_extra_content() {
            let input = "---\ntitle: Page\n---\n\n# Title\n\nContent";
            let outcome = extract_front_matter(input);
            assert!(outcome.is_ok());
            assert_eq!(outcome.unwrap(), "# Title\n\nContent");
        }

        #[test]
        fn test_extract_front_matter_with_mid_document_delimiter() {
            let input = "# Title\nContent\n---\nkey: value\n---";
            let outcome = extract_front_matter(input);
            assert!(outcome.is_ok());
            assert_eq!(outcome.unwrap(), input);
        }
    }

    /// Behaviour of `format_header_with_id_class`.
    mod format_header_with_id_class_tests {
        use super::*;

        #[test]
        fn test_valid_header_default_generators() {
            let formatted = format_header_with_id_class(
                "<h2>Hello, World!</h2>",
                None,
                None,
            )
            .expect("valid header");
            assert_eq!(
                formatted,
                "<h2 id=\"hello-world\" class=\"hello-world\">Hello, World!</h2>"
            );
        }

        #[test]
        fn test_custom_id_and_class_generators() {
            fn id_gen(content: &str) -> String {
                format!(
                    "custom-{}",
                    content.to_lowercase().replace(' ', "-")
                )
            }
            fn class_gen(_: &str) -> String {
                "custom-class".to_string()
            }

            let formatted = format_header_with_id_class(
                "<h3>Test Header</h3>",
                Some(id_gen),
                Some(class_gen),
            )
            .expect("valid header with custom generators");
            assert_eq!(
                formatted,
                "<h3 id=\"custom-test-header\" class=\"custom-class\">Test Header</h3>"
            );
        }

        #[test]
        fn test_invalid_header_format() {
            let outcome = format_header_with_id_class(
                "<p>Not a header</p>",
                None,
                None,
            );
            assert!(matches!(
                outcome,
                Err(HtmlError::InvalidHeaderFormat(_))
            ));
        }

        #[test]
        fn test_header_with_nested_tags() {
            let outcome = format_header_with_id_class(
                "<h2><span>Nested Header</span></h2>",
                None,
                None,
            );
            assert!(outcome.is_ok());
            assert_eq!(
                outcome.unwrap(),
                "<h2 id=\"span-nested-header-span\" class=\"span-nested-header-span\"><span>Nested Header</span></h2>"
            );
        }

        #[test]
        fn test_format_header_with_long_content() {
            let long_header = format!("<h1>{}</h1>", "a".repeat(300));
            let outcome =
                format_header_with_id_class(&long_header, None, None);
            assert!(outcome.is_ok());
        }

        #[test]
        fn test_header_with_special_characters() {
            let outcome = format_header_with_id_class(
                "<h3>Special & Header!</h3>",
                None,
                None,
            );
            assert!(outcome.is_ok());
            assert_eq!(
                outcome.unwrap(),
                "<h3 id=\"special-header\" class=\"special-header\">Special & Header!</h3>"
            );
        }
    }

    /// Behaviour of `generate_table_of_contents`.
    mod generate_table_of_contents_tests {
        use super::*;

        #[test]
        fn test_valid_html_with_headers() {
            let toc =
                generate_table_of_contents("<h1>Title</h1><h2>Subtitle</h2>")
                    .expect("valid headers produce a TOC");
            assert_eq!(
                toc,
                r#"<ul><li class="toc-h1"><a href="\#title">Title</a></li><li class="toc-h2"><a href="\#subtitle">Subtitle</a></li></ul>"#
            );
        }

        #[test]
        fn test_html_without_headers() {
            let toc = generate_table_of_contents("<p>No headers here.</p>")
                .expect("no headers still yields an empty TOC");
            assert_eq!(toc, "<ul></ul>");
        }

        #[test]
        fn test_empty_html() {
            let outcome = generate_table_of_contents("");
            assert!(matches!(outcome, Err(HtmlError::InvalidInput(_))));
        }

        #[test]
        fn test_large_html_content() {
            let many_headers = "<h1>Header</h1>".repeat(1000);
            let outcome = generate_table_of_contents(&many_headers);
            assert!(outcome.is_ok());
        }

        #[test]
        fn test_generate_table_of_contents_with_malformed_html() {
            let outcome =
                generate_table_of_contents("<h1>Title<h2>Subtitle");
            assert!(outcome.is_ok());
            assert_eq!(outcome.unwrap(), "<ul></ul>");
        }

        #[test]
        fn test_generate_table_of_contents_with_attributes() {
            let markup = r#"<h1 class="header-class">Header</h1>"#;
            let outcome = generate_table_of_contents(markup);
            assert!(outcome.is_ok());
            assert_eq!(
                outcome.unwrap(),
                r#"<ul><li class="toc-h1"><a href="\#header">Header</a></li></ul>"#
            );
        }
    }

    /// ARIA role and required-property checks.
    mod aria_validation_tests {
        use super::*;

        #[test]
        fn test_valid_aria_role_for_button() {
            let fragment =
                Html::parse_fragment("<button role='button'></button>");
            let selector = scraper::Selector::parse("button").unwrap();
            let node = fragment.select(&selector).next().unwrap();
            assert!(is_valid_aria_role("button", &node));
        }

        #[test]
        fn test_invalid_aria_role_for_button() {
            let fragment =
                Html::parse_fragment("<button role='link'></button>");
            let selector = scraper::Selector::parse("button").unwrap();
            let node = fragment.select(&selector).next().unwrap();
            assert!(!is_valid_aria_role("link", &node));
        }

        #[test]
        fn test_missing_required_aria_properties() {
            let fragment =
                Html::parse_fragment(r#"<div role="slider"></div>"#);
            let selector = scraper::Selector::parse("div").unwrap();
            let node = fragment.select(&selector).next().unwrap();
            let missing = crate::accessibility::utils::get_missing_required_aria_properties(&node);
            assert_eq!(
                missing.unwrap(),
                vec![
                    "aria-valuenow".to_string(),
                    "aria-valuemin".to_string(),
                    "aria-valuemax".to_string()
                ]
            );
        }

        #[test]
        fn test_get_missing_required_aria_properties_valid_role() {
            let fragment = Html::parse_fragment(
                r#"<div role="slider" aria-valuenow="10" aria-valuemin="0" aria-valuemax="100"></div>"#,
            );
            let selector = scraper::Selector::parse("div").unwrap();
            let node = fragment.select(&selector).next().unwrap();
            let missing = crate::accessibility::utils::get_missing_required_aria_properties(&node);
            assert!(missing.is_none());
        }

        #[test]
        fn test_get_missing_required_aria_properties_unknown_role() {
            let fragment =
                Html::parse_fragment(r#"<div role="unknown"></div>"#);
            let selector = scraper::Selector::parse("div").unwrap();
            let node = fragment.select(&selector).next().unwrap();
            let missing = crate::accessibility::utils::get_missing_required_aria_properties(&node);
            assert!(missing.is_none());
        }
    }

    /// Remaining helper and edge-case tests.
    mod utility_function_tests {
        use super::*;

        #[test]
        fn test_generate_id() {
            assert_eq!(generate_id("Test Header!"), "test-header");
        }

        #[test]
        fn test_generate_id_with_special_characters() {
            assert_eq!(
                generate_id("Header--with??special**chars"),
                "header-with-special-chars"
            );
        }

        #[test]
        fn test_generate_id_with_leading_trailing_whitespace() {
            assert_eq!(generate_id(" Test Header "), "test-header");
        }

        #[test]
        fn test_generate_id_with_numeric_content() {
            assert_eq!(generate_id("12345"), "12345");
        }

        #[test]
        fn test_is_valid_language_code() {
            assert!(is_valid_language_code("en"));
            assert!(is_valid_language_code("en-US"));
            assert!(!is_valid_language_code("E"));
            assert!(!is_valid_language_code("123"));
        }

        #[test]
        fn test_is_valid_language_code_long_code() {
            assert!(is_valid_language_code("en-US-variant-123"));
        }

        #[test]
        fn test_is_valid_language_code_non_ascii() {
            assert!(!is_valid_language_code("日本語"));
        }

        #[test]
        fn test_extract_front_matter_empty_delimiters() {
            let outcome = extract_front_matter(
                "------\n# Missing proper front matter",
            );
            assert!(matches!(
                outcome,
                Err(HtmlError::InvalidFrontMatterFormat(_))
            ));
        }

        #[test]
        fn test_extract_front_matter_large_content_valid_front_matter()
        {
            let document = format!(
                "---\nkey: value\n---\n{}",
                "Content".repeat(5000)
            );
            let outcome = extract_front_matter(&document);
            assert!(outcome.is_ok());
        }

        #[test]
        fn test_format_header_with_malformed_html() {
            let outcome = format_header_with_id_class(
                "<h2 Missing closing>",
                None,
                None,
            );
            assert!(matches!(
                outcome,
                Err(HtmlError::InvalidHeaderFormat(_))
            ));
        }

        #[test]
        fn test_format_header_with_inline_styles() {
            let styled = r#"<h2 style="color: red;">Styled Header</h2>"#;
            let outcome = format_header_with_id_class(styled, None, None);
            assert!(outcome.is_ok());
            assert_eq!(
                outcome.unwrap(),
                "<h2 id=\"styled-header\" class=\"styled-header\">Styled Header</h2>"
            );
        }

        #[test]
        fn test_toc_with_nested_headers() {
            let outcome = generate_table_of_contents(
                "<div><h1>Outer</h1><h2>Inner</h2></div>",
            );
            assert!(outcome.is_ok());
            assert_eq!(
                outcome.unwrap(),
                r#"<ul><li class="toc-h1"><a href="\#outer">Outer</a></li><li class="toc-h2"><a href="\#inner">Inner</a></li></ul>"#
            );
        }

        #[test]
        fn test_toc_with_malformed_and_valid_headers() {
            let outcome =
                generate_table_of_contents("<h1>Valid</h1><h2 Malformed>");
            assert!(outcome.is_ok());
            assert_eq!(
                outcome.unwrap(),
                r#"<ul><li class="toc-h1"><a href="\#valid">Valid</a></li></ul>"#
            );
        }

        #[test]
        fn test_unsupported_html_element() {
            let fragment = Html::parse_fragment(
                "<unsupported role='custom'></unsupported>",
            );
            let selector =
                scraper::Selector::parse("unsupported").unwrap();
            let node = fragment.select(&selector).next().unwrap();
            assert!(!is_valid_aria_role("custom", &node));
        }

        #[test]
        fn test_is_valid_language_code_with_mixed_case() {
            assert!(!is_valid_language_code("eN-uS"));
            assert!(!is_valid_language_code("En#Us"));
        }

        #[test]
        fn test_generate_id_empty_content() {
            assert_eq!(generate_id(""), "");
        }

        #[test]
        fn test_generate_id_whitespace_content() {
            assert_eq!(generate_id(" "), "");
        }

        #[test]
        fn test_generate_id_symbols_only() {
            assert_eq!(generate_id("!@#$%^&*()"), "");
        }
    }
}