Merge pull request #138 from nberth/keep-source-case

Preserve case of original source text
OCamlPro · Jan 22, 2024 · 0738b47 · 0738b47
2 parents 973be89 + 39e0cc2
commit 0738b47
Show file tree

Hide file tree

Showing 28 changed files with 1,673 additions and 1,629 deletions.
diff --git a/src/lsp/cobol_indent/indent_check.ml b/src/lsp/cobol_indent/indent_check.ml
@@ -548,12 +548,14 @@ and check_statement (text:text) state ifcheck  =
 
   (*TODO: find a better way to distinguish PERFORM(inline) and PERFORM_CLOSED(out-of-line)*)
   (*A bug here, TODO.md for details*)
-  | {payload = TextWord "PERFORM"; loc } :: _ :: {payload = TextWord "TIMES"; _} :: wordlist
-  | {payload = TextWord "PERFORM"; loc } :: {payload = TextWord ("UNTIL"|"VARYING"|"WITH"|"TEST"); _}
-    :: wordlist ->
+  | { payload = TextWord "PERFORM"; loc } :: _ ::
+    { payload = TextWord "TIMES"; _} :: wordlist
+  | { payload = TextWord "PERFORM"; loc } ::
+    { payload = TextWord ("UNTIL"|"VARYING"|"WITH"|"TEST"); _} :: wordlist ->
       handle_open_scope PERFORM loc wordlist state ifcheck
 
-  | {payload = TextWord "ELSE"; loc} :: {payload = TextWord "IF"; _} :: wordlist ->
+  | { payload = TextWord "ELSE"; loc } ::
+    { payload = TextWord "IF"; _ } :: wordlist ->
       let context = exp_scope_termination THEN context in
       begin match context with
       | (THEN, _) :: (IF, _) :: context' ->
@@ -1228,6 +1230,17 @@ and check_fun = function
 
 
 let check_indentation (text:text) (state:indent_state) =
+  let text =
+    (* Note: the above code assumes upper-cased text words, so that's what we
+       need to feed it. *)
+    let open Cobol_common.Srcloc.INFIX in
+    let open Cobol_preproc.Text.TYPES in
+    EzList.tail_map begin fun tw -> match ~&tw with
+      | TextWord w -> TextWord (String.uppercase_ascii w) &@<- tw
+      | CDirWord w -> CDirWord (String.uppercase_ascii w) &@<- tw
+      | _ -> tw
+    end text
+  in
   match state.range with
   | None ->
     check_fun state.scope text state true

diff --git a/src/lsp/cobol_indent/indent_keywords.ml b/src/lsp/cobol_indent/indent_keywords.ml
@@ -11,6 +11,13 @@
 (*                                                                        *)
 (**************************************************************************)
 
+(* Mappings to/from tokens and keyword strings are very redundant with what's
+   (auto-generated!) in `cobol_parser/text_keywords.ml` (and a little bit of
+   manual mapping from `combined_tokens` in `cobol_parser/text_tokenizer.ml`.
+
+   TODO: use the aforementioned associations instead of redefining them
+   below. *)
+
 open Indent_type
 
 (*data division *)
@@ -42,12 +49,11 @@ let keyword_stmt =
     "TERMINATE"; "UNLOCK"; "UNSTRING"; "USE"; "VALIDATE"; "WRITE";
     (*standard 1985*)
     "ALTER"; "DISABLE"; "ENABLE"; "PURGE"; "RECEIVE"; "SEND";]
-let keyword_stmt_tbl = Hashtbl.create 16
-let () =
-  List.iter (fun x ->
-    Hashtbl.add keyword_stmt_tbl x ())
-    keyword_stmt
-let is_statement = Hashtbl.mem keyword_stmt_tbl
+let is_statement =
+  let keyword_stmt_tbl = Hashtbl.create 16 in
+  List.iter (fun x -> Hashtbl.add keyword_stmt_tbl x ()) keyword_stmt;
+  fun word ->
+    Hashtbl.mem keyword_stmt_tbl word
 
 
 let str_proc_keyword_tbl = Hashtbl.create 16

diff --git a/src/lsp/cobol_lsp/lsp_lookup.ml b/src/lsp/cobol_lsp/lsp_lookup.ml
@@ -61,12 +61,6 @@ open TYPES
 
 (* --- *)
 
-let bare name : Cobol_ptree.qualname = Cobol_ptree.Name name
-let qual name : Cobol_ptree.qualname option -> Cobol_ptree.qualname = function
-  | None -> Cobol_ptree.Name name
-  | Some qn -> Cobol_ptree.Qual (name, qn)
-let simple_name : Cobol_ptree.qualname -> string = function
-  | Qual (n, _) | Name n -> ~&n
 let baseloc_of_qualname: Cobol_ptree.qualname -> srcloc = function
   | Name name
   | Qual (name, _) -> ~@name

diff --git a/src/lsp/cobol_parser/text_categorizer.mll b/src/lsp/cobol_parser/text_categorizer.mll
@@ -59,7 +59,8 @@ rule token = parse
   | (sign? digit* as n) (['.' ','] as sep) (digit+ as d)
       { Numeric (n, Some (sep, d, None)) }
 
-  | (sign? digit* as n) (['.' ','] as sep) (digit+ as d) 'E' (exponent as e)
+  | (sign? digit* as n) (['.' ','] as sep) (digit+ as d) ['E' 'e']
+      (exponent as e)
       { Numeric (n, Some (sep, d, Some e)) }
 
   | ident as s                                           (* 31 characters max *)

diff --git a/src/lsp/cobol_parser/text_lexer.ml b/src/lsp/cobol_parser/text_lexer.ml
@@ -154,7 +154,7 @@ let reserve_words lexer : Cobol_config.words_spec -> unit =
 (* --- *)
 
 let token_of_keyword { token_of_keyword; _ } s =
-  match Hashtbl.find token_of_keyword s with
+  match Hashtbl.find token_of_keyword (String.uppercase_ascii s) with
   | { token; enabled = true; reserved = true } -> token
   | _ -> raise Not_found
 

diff --git a/src/lsp/cobol_preproc/preproc_state.ml b/src/lsp/cobol_preproc/preproc_state.ml
@@ -59,7 +59,7 @@ let find_phrase first_word ?(prefix = `Same) text : _ result =
           Ok { prefix; phrase; suffix }
 
 (** [find_full_phrase words ~search_deep ~try_hard ~prefix text] looks for a
-    pharse comprised of all words in [words] and termiates with a period in
+    phrase comprised of all words in [words] that termiates with a period in
     [text].  If [prefix = `Rev] and upon success, the prefix is reveresed in the
     returned structure.
 

diff --git a/src/lsp/cobol_preproc/src_lexer.mll b/src/lsp/cobol_preproc/src_lexer.mll
@@ -582,7 +582,7 @@ and cdtoken = parse
       { cdtoken lexbuf }
 
   | (nonblank+ as s)
-      { CDTok (try Hashtbl.find cdtoken_of_keyword s
+      { CDTok (try Hashtbl.find cdtoken_of_keyword (String.uppercase_ascii s)
                with Not_found -> TEXT_WORD s) }
 
   | eof
@@ -599,7 +599,7 @@ and pptoken = parse
   | '.' { PPTok PERIOD }
 
   | (([^ '(' ')']+) as s)
-      { PPTok (try Hashtbl.find pptoken_of_keyword s
+      { PPTok (try Hashtbl.find pptoken_of_keyword (String.uppercase_ascii s)
                with Not_found -> TEXT_WORD s) }
 
   | eof

diff --git a/src/lsp/cobol_preproc/src_lexing.ml b/src/lsp/cobol_preproc/src_lexing.ml
@@ -224,9 +224,8 @@ let unexpected (type k) (kind: k c)
 
 (* --- *)
 
-(* Switch to upper-cased representation already: *)
 let textword s =
-  TextWord (String.uppercase_ascii ~&s) &@<- s
+  TextWord ~&s &@<- s
 
 let cdirword ?marker ~start_pos ~end_pos s state =
   let s = remove_blanks s in
@@ -238,7 +237,7 @@ let cdirword ?marker ~start_pos ~end_pos s state =
         s, start_pos
   in
   let loc = raw_loc ~start_pos ~end_pos state in
-  emit (CDirWord (String.uppercase_ascii s) &@ loc) state
+  emit (CDirWord s &@ loc) state
 
 
 let rev_pseudotext: text -> _ state -> pseudotext * _ state = fun text state ->
@@ -392,7 +391,7 @@ let unclosed_ebcdics =
   let symc = "[0-9][0-9, ]*" in                            (* symbolic EBCDIC *)
   let dblq = "\\([^\"]\\|\"" ^ symc ^ "\"\\|\"\"\\)*\"" ^ symc in
   let splq = "\\([^']\\|'" ^ symc ^ "\"\\|''\\)*'" ^ symc in
-  let re = Str.regexp ("^\\(" ^ dblq ^ "\\|" ^ splq ^ "\\)$") in
+  let re = Str.regexp_case_fold ("^\\(" ^ dblq ^ "\\|" ^ splq ^ "\\)$") in
   fun str -> Str.string_match re str 0
 
 let quoted_alphanum ?(fitting = Nominal) ~knd

diff --git a/src/lsp/cobol_preproc/text.ml b/src/lsp/cobol_preproc/text.ml
@@ -85,9 +85,13 @@ include FMT
 (* Various predicates on source-flagged character strings *)
 
 let textwordp t = match ~&t with TextWord _ -> true | _ -> false
-let textword_eqp ~eq:w t = match ~&t with TextWord t -> t = w | _ -> false
+let textword_eqp ~eq:w t = match ~&t with
+  | TextWord t -> String.uppercase_ascii t = String.uppercase_ascii w
+  | _ -> false
 let cdirp t = match ~&t with CDirWord _ -> true | _ -> false
-let cdir_eqp ~eq:w t = match ~&t with CDirWord t -> t = w | _ -> false
+let cdir_eqp ~eq:w t = match ~&t with
+  | CDirWord t -> String.uppercase_ascii t = String.uppercase_ascii w
+  | _ -> false
 
 (* Manipulating pseudo-words and text *)
 
@@ -108,7 +112,7 @@ let split_pseudo_string w =
           (PwText t &@ tloc) :: acc, wloc
       | Str.Delim d ->
           let dloc, wloc = split_loc wloc d in
-          (PwDelim (d, Str.regexp (Str.quote d)) &@ dloc) :: acc, wloc
+          (PwDelim (d, Str.regexp_case_fold (Str.quote d)) &@ dloc) :: acc, wloc
     end ([], ~@w)
   in
   List.rev pseudoword_items

diff --git a/src/lsp/cobol_preproc/text.mli b/src/lsp/cobol_preproc/text.mli
@@ -24,6 +24,8 @@ include module type of Text_types
    and type comment = Text_types.comment
    and type comments = Text_types.comments
 
+(** [*_eqp] functions below perform (ASCII) case-insensitive comparisons. *)
+
 val textwordp : text_word with_loc -> bool
 val textword_eqp : eq:string -> text_word with_loc -> bool
 

diff --git a/src/lsp/cobol_preproc/text_processor.ml b/src/lsp/cobol_preproc/text_processor.ml
@@ -57,7 +57,7 @@ let partial_word (type k) (req: k partial_word_request) words : (k, _) result =
 let partial_subst (k: partial_replacing) ({ payload = pat; _ } as repl_from) =
   { partial_subst_dir = k.repl_dir;
     partial_subst_len = String.length pat;
-    partial_subst_regexp = Str.regexp @@ match k.repl_dir with
+    partial_subst_regexp = Str.regexp_case_fold @@ match k.repl_dir with
       | Leading  when k.repl_strict -> "^" ^ Str.quote pat ^ "\\(.+\\)$"
       | Leading                     -> "^" ^ Str.quote pat ^ "\\(.*\\)$"
       | Trailing when k.repl_strict -> "^\\(.+\\)" ^ Str.quote pat ^ "$"