Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

move auto id into AST #301

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions bin/main.ml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ let with_open_out fn f =
raise e

let process ?auto_identifiers ic oc =
let md = Omd.of_channel ic in
output_string oc (Omd.to_html ?auto_identifiers md)
let md = Omd.of_channel ?auto_identifiers ic in
output_string oc (Omd.to_html md)

let print_version () =
let version =
Expand Down
16 changes: 16 additions & 0 deletions src/ast_inline.ml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,19 @@ and 'attr link =
; destination : string
; title : string option
}

let to_plain_text t =
let buf = Buffer.create 1024 in
let rec go : _ inline -> unit = function
| Concat (_, l) -> List.iter go l
| Text (_, t) | Code (_, t) -> Buffer.add_string buf t
| Emph (_, i)
| Strong (_, i)
| Link (_, { label = i; _ })
| Image (_, { label = i; _ }) ->
go i
| Hard_break _ | Soft_break _ -> Buffer.add_char buf ' '
| Html _ -> ()
in
go t;
Buffer.contents buf
129 changes: 4 additions & 125 deletions src/html.ml
Original file line number Diff line number Diff line change
Expand Up @@ -85,94 +85,6 @@ let escape_uri s =
s;
Buffer.contents b

let trim_start_while p s =
let start = ref true in
let b = Buffer.create (String.length s) in
Uutf.String.fold_utf_8
(fun () _ -> function
| `Malformed _ -> Buffer.add_string b s
| `Uchar u when p u && !start -> ()
| `Uchar u when !start ->
start := false;
Uutf.Buffer.add_utf_8 b u
| `Uchar u -> Uutf.Buffer.add_utf_8 b u)
()
s;
Buffer.contents b

let underscore = Uchar.of_char '_'
let hyphen = Uchar.of_char '-'
let period = Uchar.of_char '.'
let is_white_space = Uucp.White.is_white_space
let is_alphabetic = Uucp.Alpha.is_alphabetic
let is_hex_digit = Uucp.Num.is_hex_digit

module Identifiers : sig
type t

val empty : t

val touch : string -> t -> int * t
(** Bump the frequency count for the given string.
It returns the previous count (before bumping) *)
end = struct
module SMap = Map.Make (String)

type t = int SMap.t

let empty = SMap.empty
let count s t = match SMap.find_opt s t with None -> 0 | Some x -> x
let incr s t = SMap.add s (count s t + 1) t

let touch s t =
let count = count s t in
(count, incr s t)
end

(* Based on pandoc algorithm to derive id's.
See: https://pandoc.org/MANUAL.html#extension-auto_identifiers *)
let slugify s =
let s = trim_start_while (fun c -> not (is_alphabetic c)) s in
let length = String.length s in
let b = Buffer.create length in
let last_is_ws = ref false in
let add_to_buffer u =
if !last_is_ws = true then begin
Uutf.Buffer.add_utf_8 b (Uchar.of_char '-');
last_is_ws := false
end;
Uutf.Buffer.add_utf_8 b u
in
let fold () _ = function
| `Malformed _ -> add_to_buffer Uutf.u_rep
| `Uchar u when is_white_space u && not !last_is_ws -> last_is_ws := true
| `Uchar u when is_white_space u && !last_is_ws -> ()
| `Uchar u ->
(if is_alphabetic u || is_hex_digit u then
match Uucp.Case.Map.to_lower u with
| `Self -> add_to_buffer u
| `Uchars us -> List.iter add_to_buffer us);
if u = underscore || u = hyphen || u = period then add_to_buffer u
in
Uutf.String.fold_utf_8 fold () s;
Buffer.contents b

let to_plain_text t =
let buf = Buffer.create 1024 in
let rec go : _ inline -> unit = function
| Concat (_, l) -> List.iter go l
| Text (_, t) | Code (_, t) -> Buffer.add_string buf t
| Emph (_, i)
| Strong (_, i)
| Link (_, { label = i; _ })
| Image (_, { label = i; _ }) ->
go i
| Hard_break _ | Soft_break _ -> Buffer.add_char buf ' '
| Html _ -> ()
in
go t;
Buffer.contents buf

let nl = Raw "\n"

let rec url label destination title attrs =
Expand Down Expand Up @@ -249,13 +161,9 @@ let table_body headers rows =
row)))
rows))

let rec block ~auto_identifiers = function
let rec block = function
| Blockquote (attr, q) ->
elt
Block
"blockquote"
attr
(Some (concat nl (concat_map (block ~auto_identifiers) q)))
elt Block "blockquote" attr (Some (concat nl (concat_map block q)))
| Paragraph (attr, md) -> elt Block "p" attr (Some (inline md))
| List (attr, ty, sp, bl) ->
let name = match ty with Ordered _ -> "ol" | Bullet _ -> "ul" in
Expand All @@ -268,7 +176,7 @@ let rec block ~auto_identifiers = function
let block' t =
match (t, sp) with
| Paragraph (_, t), Tight -> concat (inline t) nl
| _ -> block ~auto_identifiers t
| _ -> block t
in
let nl = if sp = Tight then Null else nl in
elt Block "li" [] (Some (concat nl (concat_map block' t)))
Expand Down Expand Up @@ -311,36 +219,7 @@ let rec block ~auto_identifiers = function
attr
(Some (concat (table_header headers) (table_body headers rows)))

let of_doc ?(auto_identifiers = true) doc =
let identifiers = Identifiers.empty in
let f identifiers = function
| Heading (attr, level, text) ->
let attr, identifiers =
if (not auto_identifiers) || List.mem_assoc "id" attr then
(attr, identifiers)
else
let id = slugify (to_plain_text text) in
(* Default identifier if empty. It matches what pandoc does. *)
let id = if id = "" then "section" else id in
let count, identifiers = Identifiers.touch id identifiers in
let id =
if count = 0 then id else Printf.sprintf "%s-%i" id count
in
(("id", id) :: attr, identifiers)
in
(Heading (attr, level, text), identifiers)
| _ as c -> (c, identifiers)
in
let html, _ =
List.fold_left
(fun (accu, ids) x ->
let x', ids = f ids x in
let el = concat accu (block ~auto_identifiers x') in
(el, ids))
(Null, identifiers)
doc
in
html
let of_doc doc = concat_map block doc

let to_string t =
let buf = Buffer.create 1024 in
Expand Down
2 changes: 1 addition & 1 deletion src/html.mli
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ type t =
| Concat of t * t

val htmlentities : string -> string
val of_doc : ?auto_identifiers:bool -> attributes block list -> t
val of_doc : attributes block list -> t
val to_string : t -> string
11 changes: 11 additions & 0 deletions src/identifiers.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
module SMap = Map.Make (String)

type t = int SMap.t

let empty = SMap.empty
let count s t = match SMap.find_opt s t with None -> 0 | Some x -> x
let incr s t = SMap.add s (count s t + 1) t

let touch s t =
let count = count s t in
(count, incr s t)
7 changes: 7 additions & 0 deletions src/identifiers.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
type t

val empty : t

val touch : string -> t -> int * t
(** Bump the frequency count for the given string.
It returns the previous count (before bumping) *)
88 changes: 82 additions & 6 deletions src/omd.ml
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,96 @@ let toc = Toc.toc

let parse_inline defs s = Parser.inline defs (Parser.P.of_string s)

let parse_inlines (md, defs) : doc =
let trim_start_while p s =
let start = ref true in
let b = Buffer.create (String.length s) in
Uutf.String.fold_utf_8
(fun () _ -> function
| `Malformed _ -> Buffer.add_string b s
| `Uchar u when p u && !start -> ()
| `Uchar u when !start ->
start := false;
Uutf.Buffer.add_utf_8 b u
| `Uchar u -> Uutf.Buffer.add_utf_8 b u)
()
s;
Buffer.contents b

let underscore = Uchar.of_char '_'
let hyphen = Uchar.of_char '-'
let period = Uchar.of_char '.'
let is_white_space = Uucp.White.is_white_space
let is_alphabetic = Uucp.Alpha.is_alphabetic
let is_hex_digit = Uucp.Num.is_hex_digit

let slugify s =
let s = trim_start_while (fun c -> not (is_alphabetic c)) s in
let length = String.length s in
let b = Buffer.create length in
let last_is_ws = ref false in
let add_to_buffer u =
if !last_is_ws = true then begin
Uutf.Buffer.add_utf_8 b (Uchar.of_char '-');
last_is_ws := false
end;
Uutf.Buffer.add_utf_8 b u
in
let fold () _ = function
| `Malformed _ -> add_to_buffer Uutf.u_rep
| `Uchar u when is_white_space u && not !last_is_ws -> last_is_ws := true
| `Uchar u when is_white_space u && !last_is_ws -> ()
| `Uchar u ->
(if is_alphabetic u || is_hex_digit u then
match Uucp.Case.Map.to_lower u with
| `Self -> add_to_buffer u
| `Uchars us -> List.iter add_to_buffer us);
if u = underscore || u = hyphen || u = period then add_to_buffer u
in
Uutf.String.fold_utf_8 fold () s;
Buffer.contents b

let parse_inlines ~auto_identifiers (md, defs) : doc =
let defs =
let f (def : attributes Parser.link_def) =
{ def with label = Parser.normalize def.label }
in
List.map f defs
in
List.map (Ast_block.Mapper.map (parse_inline defs)) md
let identifiers = Identifiers.empty in
let f identifiers = function
| Ast_block.WithInline.Heading (attr, level, text) ->
let attr, identifiers =
if (not auto_identifiers) || List.mem_assoc "id" attr then
(attr, identifiers)
else
let id = slugify (Ast_inline.to_plain_text text) in
(* Default identifier if empty. It matches what pandoc does. *)
let id = if id = "" then "section" else id in
let count, identifiers = Identifiers.touch id identifiers in
let id =
if count = 0 then id else Printf.sprintf "%s-%i" id count
in
(("id", id) :: attr, identifiers)
in
(Ast_block.WithInline.Heading (attr, level, text), identifiers)
| _ as c -> (c, identifiers)
in
List.fold_left
(fun (accu, ids) src ->
let dst, ids = src |> Ast_block.Mapper.map (parse_inline defs) |> f ids in
(dst :: accu, ids))
([], identifiers)
md
|> fst
|> List.rev

let escape_html_entities = Html.htmlentities
let of_channel ic : doc = parse_inlines (Block_parser.Pre.of_channel ic)
let of_string s = parse_inlines (Block_parser.Pre.of_string s)

let to_html ?auto_identifiers doc =
Html.to_string (Html.of_doc ?auto_identifiers doc)
let of_channel ?(auto_identifiers = true) ic : doc =
parse_inlines ~auto_identifiers (Block_parser.Pre.of_channel ic)

let of_string ?(auto_identifiers = true) s =
parse_inlines ~auto_identifiers (Block_parser.Pre.of_string s)

let to_html doc = Html.to_string (Html.of_doc doc)
let to_sexp ast = Format.asprintf "@[%a@]@." Sexp.print (Sexp.create ast)
6 changes: 3 additions & 3 deletions src/omd.mli
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ val escape_html_entities : string -> string

(** {2 Converting to and from documents} *)

val of_channel : in_channel -> doc
val of_string : string -> doc
val to_html : ?auto_identifiers:bool -> doc -> string
val of_channel : ?auto_identifiers:bool -> in_channel -> doc
val of_string : ?auto_identifiers:bool -> string -> doc
val to_html : doc -> string
val to_sexp : doc -> string
4 changes: 2 additions & 2 deletions tests/expect_tests.ml
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,10 @@ let%expect_test "construct inline elements" =

let%expect_test "construct headings" =
show Omd.Ctor.[ h 1 ~attrs:[ ("class", "my-class") ] [ txt "Heading 1" ] ];
[%expect {| <h1 id="heading-1" class="my-class">Heading 1</h1> |}];
[%expect {| <h1 class="my-class">Heading 1</h1> |}];
show Omd.Ctor.[ h 6 [ txt "Heading 6"; em "with emphasis!" ] ];
[%expect
{| <h6 id="heading-6with-emphasis">Heading 6<em>with emphasis!</em></h6> |}]
{| <h6>Heading 6<em>with emphasis!</em></h6> |}]
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's a side effect of the new implementation. Since we now add the auto-ids at the time we parse the document, there's no way to have them auto-included here.


let%expect_test "construct lists" =
show
Expand Down
2 changes: 1 addition & 1 deletion tests/omd.ml
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ let with_open_in fn f =
let () =
with_open_in Sys.argv.(1) @@ fun ic ->
print_string
(normalize_html (Omd.to_html ~auto_identifiers:false (Omd.of_channel ic)))
(normalize_html (Omd.to_html (Omd.of_channel ~auto_identifiers:false ic)))