uuseg icon indicating copy to clipboard operation
uuseg copied to clipboard

Option to break between apostrophe and vowels

Open vouillon opened this issue 9 years ago • 1 comment

It might be interesting to have the option to use rule WB5a of UAX 29, for French and Italian texts.

vouillon avatar Jan 27 '16 09:01 vouillon

I'm not sure yet whether I want to provide this directly in uuseg or as part of a larger language-aware text processing framework.

For now you should be able to play with something like this:

(* Sets of Unicode characters, ordered by [Uucp.Uchar]. *)
module Uset = struct
  include Set.Make (Uucp.Uchar)

  (* [of_list us] is the set holding every element of the list [us]. *)
  let of_list us =
    let insert set u = add u set in
    List.fold_left insert empty us
end

(* Segments words according to UAX 29 + WB5a on NFD text.
   WB5a: apostrophe ÷ vowels *)

(* Progress of the WB5a wrapper between two calls to its [add] function. *)
type word_wb5a_state =
  | Buf    (* an extra `Boundary was returned; [buf] holds the withheld
              character that the next call must return *)
  | Apos   (* the last character returned was an apostrophe *)
  | Normal (* any other situation *)

(* State of a word segmenter extended with rule WB5a. *)
type word_wb5a = {
  word : Uuseg.t;                  (* wrapped segmenter doing the UAX 29 work *)
  vowels : Uset.t;                 (* characters to break before when they
                                      follow an apostrophe *)
  mutable state : word_wb5a_state; (* see [word_wb5a_state] *)
  mutable buf : [ `Uchar of int ]; (* withheld character, only meaningful
                                      while [state] is [Buf] *)
}

(* [word_wb5a vowels] is a custom segmenter that wraps Uuseg's [`Word]
   segmenter and additionally returns a [`Boundary] between an apostrophe
   (U+0027 or U+2019) and a directly following character that is a member
   of [vowels].

   The extra boundary is inserted by withholding the vowel: [add] first
   returns [`Boundary], and returns the withheld [`Uchar] on the next call,
   which must be made with [`Await]. *)
let word_wb5a vowels =
  let create () =
    { word = Uuseg.create `Word; vowels; state = Normal;
      buf = `Uchar 0x0000 }
  in
  (* The wrapped segmenter is copied explicitly with [Uuseg.copy]; the other
     fields are duplicated by the record copy itself. *)
  let copy s = { s with word = Uuseg.copy s.word } in
  let add s v = match s.state with
  | Buf ->
      (* The previous call returned the extra `Boundary: anything but
         `Await is a client error. Release the withheld character. *)
      if v <> `Await then Uuseg.err_exp_await v;
      s.state <- Normal; (s.buf :> Uuseg.ret)
  | Apos | Normal ->
      begin match Uuseg.add s.word v with
      | `Uchar u as v ->
          begin match u with
          | 0x0027 (* APOSTROPHE *)
          | 0x2019 (* RIGHT SINGLE QUOTATION MARK *) ->
              s.state <- Apos; v
          | u when s.state = Apos && Uset.mem u s.vowels ->
              s.state <- Buf; s.buf <- v; `Boundary
          | _ -> s.state <- Normal; v
          end
      | `Boundary as v ->
          (* The wrapped segmenter already breaks here (e.g. after a leading
             quote). Forget the apostrophe so that a following vowel does
             not produce a second, duplicate `Boundary. *)
          s.state <- Normal; v
      | (`Await | `End) as v -> v
      end
  in
  Uuseg.custom ~name:"Reach_text.word_wb5a" ~create ~copy ~add ()

(* Characters before which [word_wb5a] breaks after an apostrophe in French
   text: the vowels a, e, i, o, u, y, the ligatures œ and æ, and the letter
   h, each in lower and upper case.
   NOTE(review): h is presumably listed because elision also happens before
   a mute h — confirm. *)
let french_vowels = Uset.of_list
      [ 0x0061 (* a *); 0x0065 (* e *); 0x0069 (* i *); 0x006F (* o *);
        0x0075 (* u *); 0x0079 (* y *);
        0x0153 (* œ *); 0x00E6 (* æ *); 0x0068 (* h *);
        0x0041 (* A *); 0x0045 (* E *); 0x0049 (* I *); 0x004F (* O *);
        0x0055 (* U *); 0x0059 (* Y *);
        (* Æ is U+00C6; U+00E7 is ç, which is not a vowel. *)
        0x0152 (* Œ *); 0x00C6 (* Æ *); 0x0048 (* H *); ]

(* Word boundary specification for French text: the WB5a-extended segmenter
   instantiated with [french_vowels]. *)
let french_words : Uuseg.boundary =
  `Custom (word_wb5a french_vowels)

dbuenzli avatar Jan 27 '16 09:01 dbuenzli