diff --git a/bin/main.ml b/bin/main.ml index 5db8769c..b1ed837c 100644 --- a/bin/main.ml +++ b/bin/main.ml @@ -19,8 +19,8 @@ let with_open_out fn f = raise e let process ?auto_identifiers ic oc = - let md = Omd.of_channel ic in - output_string oc (Omd.to_html ?auto_identifiers md) + let md = Omd.of_channel ?auto_identifiers ic in + output_string oc (Omd.to_html md) let print_version () = let version = diff --git a/src/ast_inline.ml b/src/ast_inline.ml index a74c8de9..b54cd80b 100644 --- a/src/ast_inline.ml +++ b/src/ast_inline.ml @@ -22,3 +22,19 @@ and 'attr link = ; destination : string ; title : string option } + +let to_plain_text t = + let buf = Buffer.create 1024 in + let rec go : _ inline -> unit = function + | Concat (_, l) -> List.iter go l + | Text (_, t) | Code (_, t) -> Buffer.add_string buf t + | Emph (_, i) + | Strong (_, i) + | Link (_, { label = i; _ }) + | Image (_, { label = i; _ }) -> + go i + | Hard_break _ | Soft_break _ -> Buffer.add_char buf ' ' + | Html _ -> () + in + go t; + Buffer.contents buf diff --git a/src/html.ml b/src/html.ml index dd817d34..d7a3df14 100644 --- a/src/html.ml +++ b/src/html.ml @@ -85,94 +85,6 @@ let escape_uri s = s; Buffer.contents b -let trim_start_while p s = - let start = ref true in - let b = Buffer.create (String.length s) in - Uutf.String.fold_utf_8 - (fun () _ -> function - | `Malformed _ -> Buffer.add_string b s - | `Uchar u when p u && !start -> () - | `Uchar u when !start -> - start := false; - Uutf.Buffer.add_utf_8 b u - | `Uchar u -> Uutf.Buffer.add_utf_8 b u) - () - s; - Buffer.contents b - -let underscore = Uchar.of_char '_' -let hyphen = Uchar.of_char '-' -let period = Uchar.of_char '.' -let is_white_space = Uucp.White.is_white_space -let is_alphabetic = Uucp.Alpha.is_alphabetic -let is_hex_digit = Uucp.Num.is_hex_digit - -module Identifiers : sig - type t - - val empty : t - - val touch : string -> t -> int * t - (** Bump the frequency count for the given string. - It returns the previous count (before bumping) *) -end = struct - module SMap = Map.Make (String) - - type t = int SMap.t - - let empty = SMap.empty - let count s t = match SMap.find_opt s t with None -> 0 | Some x -> x - let incr s t = SMap.add s (count s t + 1) t - - let touch s t = - let count = count s t in - (count, incr s t) -end - -(* Based on pandoc algorithm to derive id's. - See: https://pandoc.org/MANUAL.html#extension-auto_identifiers *) -let slugify s = - let s = trim_start_while (fun c -> not (is_alphabetic c)) s in - let length = String.length s in - let b = Buffer.create length in - let last_is_ws = ref false in - let add_to_buffer u = - if !last_is_ws = true then begin - Uutf.Buffer.add_utf_8 b (Uchar.of_char '-'); - last_is_ws := false - end; - Uutf.Buffer.add_utf_8 b u - in - let fold () _ = function - | `Malformed _ -> add_to_buffer Uutf.u_rep - | `Uchar u when is_white_space u && not !last_is_ws -> last_is_ws := true - | `Uchar u when is_white_space u && !last_is_ws -> () - | `Uchar u -> - (if is_alphabetic u || is_hex_digit u then - match Uucp.Case.Map.to_lower u with - | `Self -> add_to_buffer u - | `Uchars us -> List.iter add_to_buffer us); - if u = underscore || u = hyphen || u = period then add_to_buffer u - in - Uutf.String.fold_utf_8 fold () s; - Buffer.contents b - -let to_plain_text t = - let buf = Buffer.create 1024 in - let rec go : _ inline -> unit = function - | Concat (_, l) -> List.iter go l - | Text (_, t) | Code (_, t) -> Buffer.add_string buf t - | Emph (_, i) - | Strong (_, i) - | Link (_, { label = i; _ }) - | Image (_, { label = i; _ }) -> - go i - | Hard_break _ | Soft_break _ -> Buffer.add_char buf ' ' - | Html _ -> () - in - go t; - Buffer.contents buf - let nl = Raw "\n" let rec url label destination title attrs = @@ -249,13 +161,9 @@ let table_body headers rows = row))) rows)) -let rec block ~auto_identifiers = function +let rec block = function | Blockquote (attr, q) -> - elt - Block - "blockquote" - attr - (Some (concat nl (concat_map (block ~auto_identifiers) q))) + elt Block "blockquote" attr (Some (concat nl (concat_map block q))) | Paragraph (attr, md) -> elt Block "p" attr (Some (inline md)) | List (attr, ty, sp, bl) -> let name = match ty with Ordered _ -> "ol" | Bullet _ -> "ul" in @@ -268,7 +176,7 @@ let rec block ~auto_identifiers = function let block' t = match (t, sp) with | Paragraph (_, t), Tight -> concat (inline t) nl - | _ -> block ~auto_identifiers t + | _ -> block t in let nl = if sp = Tight then Null else nl in elt Block "li" [] (Some (concat nl (concat_map block' t))) @@ -311,36 +219,7 @@ let rec block ~auto_identifiers = function attr (Some (concat (table_header headers) (table_body headers rows))) -let of_doc ?(auto_identifiers = true) doc = - let identifiers = Identifiers.empty in - let f identifiers = function - | Heading (attr, level, text) -> - let attr, identifiers = - if (not auto_identifiers) || List.mem_assoc "id" attr then - (attr, identifiers) - else - let id = slugify (to_plain_text text) in - (* Default identifier if empty. It matches what pandoc does. *) - let id = if id = "" then "section" else id in - let count, identifiers = Identifiers.touch id identifiers in - let id = - if count = 0 then id else Printf.sprintf "%s-%i" id count - in - (("id", id) :: attr, identifiers) - in - (Heading (attr, level, text), identifiers) - | _ as c -> (c, identifiers) - in - let html, _ = - List.fold_left - (fun (accu, ids) x -> - let x', ids = f ids x in - let el = concat accu (block ~auto_identifiers x') in - (el, ids)) - (Null, identifiers) - doc - in - html +let of_doc doc = concat_map block doc let to_string t = let buf = Buffer.create 1024 in diff --git a/src/html.mli b/src/html.mli index 8de86832..342e9f9b 100644 --- a/src/html.mli +++ b/src/html.mli @@ -13,5 +13,5 @@ type t = | Concat of t * t val htmlentities : string -> string -val of_doc : ?auto_identifiers:bool -> attributes block list -> t +val of_doc : attributes block list -> t val to_string : t -> string diff --git a/src/identifiers.ml b/src/identifiers.ml new file mode 100644 index 00000000..47102de3 --- /dev/null +++ b/src/identifiers.ml @@ -0,0 +1,11 @@ +module SMap = Map.Make (String) + +type t = int SMap.t + +let empty = SMap.empty +let count s t = match SMap.find_opt s t with None -> 0 | Some x -> x +let incr s t = SMap.add s (count s t + 1) t + +let touch s t = + let count = count s t in + (count, incr s t) diff --git a/src/identifiers.mli b/src/identifiers.mli new file mode 100644 index 00000000..3572e6fc --- /dev/null +++ b/src/identifiers.mli @@ -0,0 +1,7 @@ +type t + +val empty : t + +val touch : string -> t -> int * t +(** Bump the frequency count for the given string. + It returns the previous count (before bumping) *) diff --git a/src/omd.ml b/src/omd.ml index d00f26bf..de183256 100644 --- a/src/omd.ml +++ b/src/omd.ml @@ -15,20 +15,96 @@ let toc = Toc.toc let parse_inline defs s = Parser.inline defs (Parser.P.of_string s) -let parse_inlines (md, defs) : doc = +let trim_start_while p s = + let start = ref true in + let b = Buffer.create (String.length s) in + Uutf.String.fold_utf_8 + (fun () _ -> function + | `Malformed _ -> Buffer.add_string b s + | `Uchar u when p u && !start -> () + | `Uchar u when !start -> + start := false; + Uutf.Buffer.add_utf_8 b u + | `Uchar u -> Uutf.Buffer.add_utf_8 b u) + () + s; + Buffer.contents b + +let underscore = Uchar.of_char '_' +let hyphen = Uchar.of_char '-' +let period = Uchar.of_char '.' +let is_white_space = Uucp.White.is_white_space +let is_alphabetic = Uucp.Alpha.is_alphabetic +let is_hex_digit = Uucp.Num.is_hex_digit + +let slugify s = + let s = trim_start_while (fun c -> not (is_alphabetic c)) s in + let length = String.length s in + let b = Buffer.create length in + let last_is_ws = ref false in + let add_to_buffer u = + if !last_is_ws = true then begin + Uutf.Buffer.add_utf_8 b (Uchar.of_char '-'); + last_is_ws := false + end; + Uutf.Buffer.add_utf_8 b u + in + let fold () _ = function + | `Malformed _ -> add_to_buffer Uutf.u_rep + | `Uchar u when is_white_space u && not !last_is_ws -> last_is_ws := true + | `Uchar u when is_white_space u && !last_is_ws -> () + | `Uchar u -> + (if is_alphabetic u || is_hex_digit u then + match Uucp.Case.Map.to_lower u with + | `Self -> add_to_buffer u + | `Uchars us -> List.iter add_to_buffer us); + if u = underscore || u = hyphen || u = period then add_to_buffer u + in + Uutf.String.fold_utf_8 fold () s; + Buffer.contents b + +let parse_inlines ~auto_identifiers (md, defs) : doc = let defs = let f (def : attributes Parser.link_def) = { def with label = Parser.normalize def.label } in List.map f defs in - List.map (Ast_block.Mapper.map (parse_inline defs)) md + let identifiers = Identifiers.empty in + let f identifiers = function + | Ast_block.WithInline.Heading (attr, level, text) -> + let attr, identifiers = + if (not auto_identifiers) || List.mem_assoc "id" attr then + (attr, identifiers) + else + let id = slugify (Ast_inline.to_plain_text text) in + (* Default identifier if empty. It matches what pandoc does. *) + let id = if id = "" then "section" else id in + let count, identifiers = Identifiers.touch id identifiers in + let id = + if count = 0 then id else Printf.sprintf "%s-%i" id count + in + (("id", id) :: attr, identifiers) + in + (Ast_block.WithInline.Heading (attr, level, text), identifiers) + | _ as c -> (c, identifiers) + in + List.fold_left + (fun (accu, ids) src -> + let dst, ids = src |> Ast_block.Mapper.map (parse_inline defs) |> f ids in + (dst :: accu, ids)) + ([], identifiers) + md + |> fst + |> List.rev let escape_html_entities = Html.htmlentities -let of_channel ic : doc = parse_inlines (Block_parser.Pre.of_channel ic) -let of_string s = parse_inlines (Block_parser.Pre.of_string s) -let to_html ?auto_identifiers doc = - Html.to_string (Html.of_doc ?auto_identifiers doc) +let of_channel ?(auto_identifiers = true) ic : doc = + parse_inlines ~auto_identifiers (Block_parser.Pre.of_channel ic) + +let of_string ?(auto_identifiers = true) s = + parse_inlines ~auto_identifiers (Block_parser.Pre.of_string s) +let to_html doc = Html.to_string (Html.of_doc doc) let to_sexp ast = Format.asprintf "@[%a@]@." Sexp.print (Sexp.create ast) diff --git a/src/omd.mli b/src/omd.mli index 87329fc9..f2010b4d 100644 --- a/src/omd.mli +++ b/src/omd.mli @@ -26,7 +26,7 @@ val escape_html_entities : string -> string (** {2 Converting to and from documents} *) -val of_channel : in_channel -> doc -val of_string : string -> doc -val to_html : ?auto_identifiers:bool -> doc -> string +val of_channel : ?auto_identifiers:bool -> in_channel -> doc +val of_string : ?auto_identifiers:bool -> string -> doc +val to_html : doc -> string val to_sexp : doc -> string diff --git a/tests/expect_tests.ml b/tests/expect_tests.ml index 4940fae0..a7f50961 100644 --- a/tests/expect_tests.ml +++ b/tests/expect_tests.ml @@ -34,10 +34,10 @@ let%expect_test "construct inline elements" = let%expect_test "construct headings" = show Omd.Ctor.[ h 1 ~attrs:[ ("class", "my-class") ] [ txt "Heading 1" ] ]; - [%expect {|

Heading 1

|}]; + [%expect {|

Heading 1

|}]; show Omd.Ctor.[ h 6 [ txt "Heading 6"; em "with emphasis!" ] ]; [%expect - {|
Heading 6with emphasis!
|}] + {|
Heading 6with emphasis!
|}] let%expect_test "construct lists" = show diff --git a/tests/omd.ml b/tests/omd.ml index efef26f8..da47f0b0 100644 --- a/tests/omd.ml +++ b/tests/omd.ml @@ -20,4 +20,4 @@ let with_open_in fn f = let () = with_open_in Sys.argv.(1) @@ fun ic -> print_string - (normalize_html (Omd.to_html ~auto_identifiers:false (Omd.of_channel ic))) + (normalize_html (Omd.to_html (Omd.of_channel ~auto_identifiers:false ic)))