uuseg
uuseg copied to clipboard
Option to break between apostrophe and vowels
It might be interesting to have the option to use rule WB5a of UAX 29, for French and Italian texts.
I'm not yet sure whether I want to provide this directly in uuseg or as part of a larger language-aware text processing framework.
For now you should be able to play with something like this:
(* Sets of Unicode characters, ordered by [Uucp.Uchar]. *)
module Uset = struct
  include Set.Make (Uucp.Uchar)

  (* [of_list us] is the set holding every element of [us]; elements are
     inserted left to right, starting from [empty]. *)
  let of_list us =
    let rec insert_all set = function
      | [] -> set
      | u :: rest -> insert_all (add u set) rest
    in
    insert_all empty us
end
(* Segments words according to UAX 29 + WB5a on NFD
WB5a apostrophe ÷ vowels *)
(* State of the WB5a wrapper around the word segmenter.
   - [Buf]: a character is held back in the [buf] field; the next [add]
     must be [`Await] and returns that character.
   - [Apos]: the last character returned was U+0027 or U+2019.
   - [Normal]: any other situation. *)
type word_wb5a_state = Buf | Apos | Normal
(* Segmenter state for UAX 29 word segmentation extended with a break
   between an apostrophe and a following vowel. *)
type word_wb5a =
(* [word]: the underlying Uuseg segmenter whose output is post-processed. *)
{ word : Uuseg.t;
(* [vowels]: characters before which a boundary is inserted when they
   directly follow an apostrophe. *)
vowels : Uset.t;
(* [state]: where the wrapper stands, see [word_wb5a_state]. *)
mutable state : word_wb5a_state;
(* [buf]: character held back while the inserted [`Boundary] is returned;
   only meaningful while [state] is [Buf]. *)
mutable buf : [`Uchar of int]; }
(* [word_wb5a vowels] is a custom Uuseg segmenter that wraps the [`Word]
   segmenter and additionally reports a [`Boundary] between an apostrophe
   (U+0027 or U+2019) and an immediately following character of [vowels].

   The wrapper watches the characters returned by the underlying segmenter.
   When a vowel comes out right after an apostrophe, it returns [`Boundary]
   instead and holds the vowel in [buf]; the client's next [`Await] then
   gets the vowel back. *)
let word_wb5a vowels =
  let create () =
    { word = Uuseg.create `Word; vowels; state = Normal;
      (* Placeholder: [buf] is only read while [state] is [Buf]. *)
      buf = `Uchar 0x0000 }
  in
  (* The underlying segmenter is stateful, so it needs its own copy. *)
  let copy s = { s with word = Uuseg.copy s.word } in
  let add s v = match s.state with
  | Buf ->
      (* A [`Boundary] was just returned; the client is expected to add
         [`Await] now. Anything else is reported through
         [Uuseg.err_exp_await] (presumably raises -- confirm in Uuseg). *)
      if v <> `Await then Uuseg.err_exp_await v;
      s.state <- Normal; (s.buf :> Uuseg.ret)
  | Apos | Normal ->
      match Uuseg.add s.word v with
      | `Uchar u as v ->
          begin match u with
          | 0x0027 (* APOSTROPHE *)
          | 0x2019 (* RIGHT SINGLE QUOTATION MARK *) ->
              s.state <- Apos; v
          | u when s.state = Apos && Uset.mem u s.vowels ->
              (* Insert the extra boundary and hold the vowel back. *)
              s.state <- Buf; s.buf <- v; `Boundary
          | _ -> s.state <- Normal; v
          end
      | `Boundary as v ->
          (* The underlying segmenter already breaks here. Forget any
             pending apostrophe so that a following vowel does not produce
             a second, duplicate boundary (e.g. on "'a"). *)
          s.state <- Normal; v
      | `Await | `End as v ->
          (* No character was output; keep the current state. *)
          v
  in
  Uuseg.custom ~name:"Reach_text.word_wb5a" ~create ~copy ~add ()
(* Characters treated as vowels after an apostrophe in French text: the
   basic Latin vowels, the ligatures œ and æ, and the letter h, each in
   lower and upper case. Accented vowels are not listed; the segmenter is
   meant to be run on NFD text, where they start with their base letter. *)
let french_vowels = Uset.of_list
  [ 0x0061 (* a *); 0x0065 (* e *); 0x0069 (* i *); 0x006F (* o *);
    0x0075 (* u *); 0x0079 (* y *);
    0x0153 (* œ *); 0x00E6 (* æ *); 0x0068 (* h *);
    0x0041 (* A *); 0x0045 (* E *); 0x0049 (* I *); 0x004F (* O *);
    0x0055 (* U *); 0x0059 (* Y *);
    0x0152 (* Œ *); 0x00C6 (* Æ *); 0x0048 (* H *); ]
(* Word boundary specification for French text: the custom segmenter built
   by [word_wb5a] over [french_vowels]. *)
let french_words : Uuseg.boundary =
  let segmenter = word_wb5a french_vowels in
  `Custom segmenter