[Camomile-commits] camomile/internal uca_collator.ml,NONE,1.1
Status: Beta
Brought to you by:
yori
From: <yo...@us...> - 2002-06-27 00:08:41
|
Update of /cvsroot/camomile/camomile/internal In directory usw-pr-cvs1:/tmp/cvs-serv15816/internal Added Files: uca_collator.ml Log Message: New colltor design. Still not work. --- NEW FILE: uca_collator.ml --- (* $Id: uca_collator.ml,v 1.1 2002/06/27 00:08:39 yori Exp $ *) (* Copyright 2002 Yamagata Yoriyuki *) open Helpers let combined_class_tbl : Tbl32.bytes = Unidata.read_data "combined_class" let combined_class u = Tbl32.get_bytes combined_class_tbl (Uchar.uint_code u) type weight = int array type element = {is_variable : bool; weight : weight} type elements_tbl = (Uchar.t list * element list) list Tbl32.t type default_table = {def_rearrange : int list; def_elements_tbl : elements_tbl} let uca_default_table : default_table = Unidata.read_data "allkeys" type delta = elements_tbl type direction_type = Forward | Backward type variable_type = Blanked | Non_ignorable | Shifted type case_option_type = Lowercase_first | Uppercase_first let rec match_init l1 l2 = match l1, l2 with [], _ -> true | _, [] -> false | (x1 :: r1), (x2 :: r2) -> if x1 = x2 then match_init r1 r2 else false let rec list_split i l = if i = 0 then [], l else let l1, l2 = list_split (i - 1) (List.tl l) in ((List.hd l) :: l1), l2 class output_to_list = object val mutable buf = [] method put (u : Uchar.t) = buf <- u :: buf method close () = () method contents = List.rev buf end class swap rearrange (output : #Uchannel.output) = object val mutable prev = None method put (u : Uchar.t) = let n = Uchar.uint_code u in match prev with None -> if List.exists (fun n' -> n = n') rearrange then prev <- Some u else output#put u | Some u' -> output#put u; output#put u'; prev <- None method close () = match prev with None -> output#close () | Some u -> output#put u; output#close () end (* for proposed update. class remove_ignorable look_elem (output : #Uchannel.output) = object method put (u : Uchar.t) = try let es = look_elem u in match es with [([], [{is_variable = _; weight = [| 0; 0; 0|]}])] -> () | _ -> output#put u with Not_found -> output#put u method close () = output#close () end *) (* For proposed update algorithm let implicit_weight n = let m = n land 0xffff in if m = 0xffff || m = 0xfffe then [{is_variable = false; weight = [|0; 0; 0|]}] else if (n >= 0xd800 && n <= 0xdfff) || n < 0 || n > 0x10ffff then [{is_variable = false; weight = [|0; 0; 0|]}] else let base = if n >= 0x4e00 && n <= 0x9fff then 0xfb40 else (* cjk ideograph *) if n >= 0x3400 && n <= 0xfdbf || (* cjk ideograph A *) n >= 0x20000 && n <= 0x2a6df then 0xfb80 (* cjk ideograph B *) else 0xfbc0 in let a = base + (n lsr 15) in let b = (n land 0x7fff) lor 0x8000 in [ {is_variable = false; weight = [| a; 0x0020; 0x0002|]}; {is_variable = false; weight = [| b; 0; 0|]} ] *) let unassigned_weight n = let n1 = n lsr 16 in let n0 = n land 0xffff in [{is_variable = false; weight = [| (0xffe0 + n1); 0x0020; 0x0002 |]}; {is_variable = false; weight = [| n0; 0x000; 0x000 |]}] let implicit_weight u = let n = Uchar.uint_code u in let m = n land 0xffff in if m = 0xffff || m = 0xfffe then [{is_variable = false; weight = [|0; 0; 0|]}] else if (n >= 0xd800 && n <= 0xdfff) || n < 0 || n > 0x10ffff then [{is_variable = false; weight = [|0; 0; 0|]}] else match Ucategory.get u with Ucategory.Cn -> unassigned_weight n | Ucategory.Co -> unassigned_weight n | _ -> [ {is_variable = false; weight = [| n; 0x0020; 0x0002 |]} ] let swap_casing_tbl = {| 0x0; 0x1; 0x0008; 0x0009; 0x000A; 0x000B; 0x000C; 0x0007; 0x0002; 0x0003; 0x0004; 0x0005; 0x0006; 0x000D; 0x000E; 0x000F; 0x0010; 0x0011; 0x0012; 0x0013; 0x0014; 0x0015; 0x0016; 0x0017; 0x0018; 0x0019; 0x001A; 0x001B; 0x001D; 0x001C; 0x001E; 0x001F |} let swap_casing element = {is_variable = element.is_variable; weight = let w = Array.copy element.weight in w.(2) <- swap_casing_tbl.( w.(2) ); w} (* For porposed update let gen_weights_blanked elems = let rec loop1 elems a = match elems with [] -> a | e :: rest -> if e.is_variable then loop2 rest ([| 0; 0; 0 |] :: a) else loop1 rest (e.weight :: a) and loop2 elems a = match elems with [] -> a | e :: rest -> if e.weight.(0) = 0 then loop2 rest ([| 0; 0; 0 |] :: a) else loop1 elems a in List.rev (loop1 elems []) *) let gen_weights_blanked elems = let rec loop elems a = match elems with [] -> a | e :: rest -> if e.is_variable then loop rest ([| 0; 0; 0 |] :: a) else loop rest (e.weight :: a) in List.rev (loop elems []) let gen_weights_non_ignorable elems = List.map (fun e -> e.weight) elems (* For porposed update let gen_weights_shifted elems = let rec loop1 elems a = match elems with [] -> a | e :: rest -> if e.weight = [| 0; 0; 0 |] then loop1 rest ([| 0; 0; 0; 0 |] :: a) else if e.is_variable then loop2 rest ([| 0; 0; 0; e.weight.(0)|] :: a) else let w = Array.make 4 0xffff in Array.blit e.weight 0 w 0 3; loop1 rest (w :: a) and loop2 elems a = match elems with [] -> a | e :: rest -> if e.weight.(0) = 0 || e.weight.(1) = 0 then loop2 rest ([| 0; 0; 0; 0 |] :: a) else loop1 elems a in List.rev (loop1 elems []) *) let gen_weights_shifted elems = let rec loop elems a = match elems with [] -> a | e :: rest -> if e.weight = [| 0; 0; 0 |] then loop rest ([| 0; 0; 0; 0 |] :: a) else if e.is_variable then loop rest ([| 0; 0; 0; e.weight.(0)|] :: a) else let w = Array.make 4 0xffff in Array.blit e.weight 0 w 0 3; loop rest (w :: a) in List.rev (loop elems []) class ['s] uca ?(rearrange = uca_default_table.rearrange) ?(elements_tbl = uca_default_table.elements_tbl) ?(variable = Shifted) ?(directions = Array.make 4 Forward) ?(case_option = Lowercase_first) () = object (self) constraint 's = #Ustorage.cur #Ustorage.t val rearrange = rearrange val elements_tbl = elements_tbl val directions = directions val max_level = Array.length directions val case_option = case_option val variable = variable method set_option ?(rearrange = uca_default_table.rearrange) ?(variable = Shifted) ?(directions = Array.make 4 Forward) ?(case_option = Lowercase_first) () = {< rearrange = rearrange; directions = directions; max_level = Array.length directions; case_option = case_option; variable = variable }> method private look_elem_lower_first u = Tbl32.get elements_tbl (Uchar.uint_code u) method private look_elem_upper_first u = let proc us, es = us, (List.map swap_casing es) List.map proc (self#look_elem_lower_first u) method private look_elem = match case_option with Lowercase_first -> look_elem_lower_first | Uppercase_first -> look_elem_upper_first method private gen_weights = match variable with Blanked -> gen_weights_blanked | Non_ignorable -> gen_weights_non_ignorable | Shifted -> gen_weights_shifted method private gen_uchars (s : 's) = let listbuf = new output_to_list in let chan = new Unormalform.nfd_filter (new swap rearrange listbuf) in Uchannel.output_string chan s; chan#close (); listbuf#contents method private get_elems uchars = let u = List.hd uchars in let es = self#look_elem col_tbl u in let remain = List.tl uchars in let rec look_match i (us', _) = if match_init us' remain then let i' = List.length us' in if i' > i then i' else i else i in let i = List.fold_left look_match 0 es in let us, remain = list_split i remain in let rec loop2 us proced remain = (* proced is reverse-orderd. *) match remain with [] -> us, (List.rev proced) | u :: rest -> let x = combined_class u in let us', p' = let us' = us @ [u] in if (proced = [] || combined_class (List.hd proced) <> x) && (List.mem_assoc us' es) then us', proced else us, (u :: proced) in if x = 0 then us', (List.rev_append p' rest) (*rests are all blocked*) else loop2 us' p' rest in let us, remain = loop2 us [] remain in try (List.assoc us es), remain with Not_found -> match us with [] -> (implicit_weight u), remain | _ -> assert false method private gen_elems uchars = let rec loop uchars a = if uchars = [] then a else let elems, remain = self#get_elems uchars in loop remain (List.rev_append elems a) in List.rev (loop uchars []) method private gen_key level weights = let buf = Array.make level [] in let proc w = for i = 0 to level - 1 do if w.(i) <> 0 then buf.(i) <- w.(i) :: buf.(i) else () done in List.iter proc weights; let ret = ref [] in let proc i buf = let subkey0 = match dirs.(i) with Forward -> List.rev buf | Backward -> buf in let subkey = if i = 0 then subkey0 else 0 :: subkey0 in ret := !ret @ subkey in Array.iteri proc buf; !ret method sort_key_fast ?(prec = 0) (s : 's) = let uchars = self#gen_uchars s in let elems = self#gen_elems uchars in let weights = self#gen_weights elems in let level = if prec <= 0 || prec >= max_level then max_level else prec in self#gen_key level weights end |