@@ -91,7 +91,7 @@ public abstract class CollationMapper implements Serializable {
9191 * using utf8mb4), 'b') = 'ab' COLLATE <collation>;} returns 1. TODO(vardhanvthigle): Check this
9292 * behavior for PG and other databases.
9393 */
94- public abstract ImmutableSet <Character > emptyCharacters ();
94+ public abstract ImmutableSet <Integer > emptyCharacters ();
9595
9696 /**
9797 * Space Characters. MySQL ignores trailing space characters in comparisons for PAD space
@@ -100,11 +100,13 @@ public abstract class CollationMapper implements Serializable {
100100 * (UNHEX(C2H0)) when the collation is Pad Space. These have same behavior to ascii space as far
101101 * as trailing or non-trailing comparison is concerned.
102102 */
103- public abstract ImmutableSet <Character > spaceCharacters ();
103+ public abstract ImmutableSet <Integer > spaceCharacters ();
104104
105105 @ Memoized
106106 String allSpaceCharacters () {
107- return this .spaceCharacters ().stream ().map (String ::valueOf ).collect (Collectors .joining ("" ));
107+ return this .spaceCharacters ().stream ()
108+ .map (c -> new String (Character .toChars (c )))
109+ .collect (Collectors .joining ("" ));
108110 }
109111
110112 @ Memoized
@@ -114,7 +116,9 @@ String emptyReplacePattern() {
114116 }
115117 return "["
116118 + Pattern .quote (
117- this .emptyCharacters ().stream ().map (String ::valueOf ).collect (Collectors .joining ("" )))
119+ this .emptyCharacters ().stream ()
120+ .map (c -> new String (Character .toChars (c )))
121+ .collect (Collectors .joining ("" )))
118122 + "]" ;
119123 }
120124
@@ -160,11 +164,16 @@ public BigInteger mapString(@Nullable String element, int lengthToPad) {
160164 }
161165
162166 // Convert the string to BigInteger.
163- for (int index = 0 ; index < element .length (); index ++) {
164- Character c = element .charAt (index );
167+ int length = element .codePointCount (0 , element .length ());
168+ int codePointIndex = 0 ;
169+ for (int index = 0 ; index < element .length (); ) {
170+ int codepoint = element .codePointAt (index );
171+ boolean isLast = (codePointIndex == length - 1 );
165172 ret =
166- ret .multiply (BigInteger .valueOf (getCharsetSize (index == (element .length () - 1 ))))
167- .add (BigInteger .valueOf (getOrdinalPosition (c , index == (element .length () - 1 ))));
173+ ret .multiply (BigInteger .valueOf (getCharsetSize (isLast )))
174+ .add (BigInteger .valueOf (getOrdinalPosition (codepoint , isLast )));
175+ index += Character .charCount (codepoint );
176+ codePointIndex ++;
168177 }
169178 for (int index = element .length (); index < lengthToPad ; index ++) {
170179 ret = ret .multiply (BigInteger .valueOf (getCharsetSize (index == (element .length () - 1 ))));
@@ -199,16 +208,16 @@ public String unMapString(BigInteger element) {
199208
200209 // Base Case that the string just represents single character
201210 if (element .equals (BigInteger .ZERO )) {
202- char c = getCharacterFromPosition (element .longValue (), true );
203- return String . valueOf ( c );
211+ int c = getCharacterFromPosition (element .longValue (), true );
212+ return new String ( Character . toChars ( c ) );
204213 }
205214
206215 while (!element .equals (BigInteger .ZERO )) {
207216 long charsetSize = getCharsetSize (index == 0 );
208217
209218 BigInteger reminder = element .mod (BigInteger .valueOf (charsetSize ));
210- char c = getCharacterFromPosition (reminder .longValue (), (index == 0 ));
211- word .append (c );
219+ int c = getCharacterFromPosition (reminder .longValue (), (index == 0 ));
220+ word .appendCodePoint (c );
212221
213222 element = element .divide (BigInteger .valueOf (charsetSize ));
214223 index ++;
@@ -318,11 +327,11 @@ private static CollationMapper fromResultSetWithWeights(
318327 if (charsetChar == null || charsetChar .isEmpty ()) {
319328 continue ;
320329 }
330+ int c = charsetChar .codePointAt (0 );
321331 Preconditions .checkArgument (
322- charsetChar .length () == 1 ,
332+ charsetChar .length () == Character . charCount ( c ) ,
323333 "Expected single character from collation query, got: %s" ,
324334 charsetChar );
325- char c = charsetChar .charAt (0 );
326335 byte [] wNt = rs .getBytes (CollationsOrderQueryColumns .WEIGHT_NON_TRAILING_COL );
327336 byte [] wT = rs .getBytes (CollationsOrderQueryColumns .WEIGHT_TRAILING_COL );
328337 boolean isEmpty = rs .getBoolean (CollationsOrderQueryColumns .IS_EMPTY_COL );
@@ -332,7 +341,7 @@ private static CollationMapper fromResultSetWithWeights(
332341 if (wNt == null && !isEmpty ) {
333342 logger .warn (
334343 "Skipping character codepoint={} for {} because weight_non_trailing is NULL" ,
335- ( int ) c ,
344+ c ,
336345 collationReference );
337346 continue ;
338347 }
@@ -343,19 +352,19 @@ private static CollationMapper fromResultSetWithWeights(
343352
344353 // Phase 2a: build all-positions equivalence groups from weight_non_trailing.
345354 // TreeMap gives groups in sorted weight order (= collation rank order).
346- TreeMap <String , TreeMap <Integer , Character >> ntGroups = new TreeMap <>(weightKeyOrder );
355+ TreeMap <String , TreeMap <Integer , Integer >> ntGroups = new TreeMap <>(weightKeyOrder );
347356 for (WeightRow row : rows ) {
348357 if (!row .isEmpty ()) {
349- ntGroups .computeIfAbsent (row .weightNt (), k -> new TreeMap <>()).put (( int ) row .c (), row .c ());
358+ ntGroups .computeIfAbsent (row .weightNt (), k -> new TreeMap <>()).put (row .c (), row .c ());
350359 }
351360 }
352361 // Assign dense ranks and resolve equivalent characters (min codepoint per group).
353- Map <Character , Long > ntRank = new HashMap <>();
354- Map <Character , Character > ntEquiv = new HashMap <>();
362+ Map <Integer , Long > ntRank = new HashMap <>();
363+ Map <Integer , Integer > ntEquiv = new HashMap <>();
355364 long rank = 0 ;
356- for (TreeMap <Integer , Character > group : ntGroups .values ()) {
357- char equiv = group .firstEntry ().getValue (); // smallest codepoint = canonical equivalent
358- for (Character c : group .values ()) {
365+ for (TreeMap <Integer , Integer > group : ntGroups .values ()) {
366+ int equiv = group .firstEntry ().getValue (); // smallest codepoint = canonical equivalent
367+ for (Integer c : group .values ()) {
359368 ntRank .put (c , rank );
360369 ntEquiv .put (c , equiv );
361370 }
@@ -365,18 +374,18 @@ private static CollationMapper fromResultSetWithWeights(
365374 // Phase 2b: build PAD-SPACE trailing equivalence groups from weight_trailing.
366375 // Space characters (is_space=true) are intentionally excluded; they are tracked in
367376 // spaceCharacters and stripped before the trailing index is consulted.
368- TreeMap <String , TreeMap <Integer , Character >> tGroups = new TreeMap <>(weightKeyOrder );
377+ TreeMap <String , TreeMap <Integer , Integer >> tGroups = new TreeMap <>(weightKeyOrder );
369378 for (WeightRow row : rows ) {
370379 if (!row .isEmpty () && !row .isSpace ()) {
371- tGroups .computeIfAbsent (row .weightT (), k -> new TreeMap <>()).put (( int ) row .c (), row .c ());
380+ tGroups .computeIfAbsent (row .weightT (), k -> new TreeMap <>()).put (row .c (), row .c ());
372381 }
373382 }
374- Map <Character , Long > tRank = new HashMap <>();
375- Map <Character , Character > tEquiv = new HashMap <>();
383+ Map <Integer , Long > tRank = new HashMap <>();
384+ Map <Integer , Integer > tEquiv = new HashMap <>();
376385 long tRankCounter = 0 ;
377- for (TreeMap <Integer , Character > group : tGroups .values ()) {
378- char equiv = group .firstEntry ().getValue ();
379- for (Character c : group .values ()) {
386+ for (TreeMap <Integer , Integer > group : tGroups .values ()) {
387+ int equiv = group .firstEntry ().getValue ();
388+ for (Integer c : group .values ()) {
380389 tRank .put (c , tRankCounter );
381390 tEquiv .put (c , equiv );
382391 }
@@ -386,9 +395,9 @@ private static CollationMapper fromResultSetWithWeights(
386395 // Phase 3: build CollationOrderRow objects and feed the mapper builder.
387396 Builder builder = builder (collationReference );
388397 for (WeightRow row : rows ) {
389- char equivChar = ntEquiv .getOrDefault (row .c (), row .c ());
398+ int equivChar = ntEquiv .getOrDefault (row .c (), row .c ());
390399 long codepointRank = ntRank .getOrDefault (row .c (), 0L );
391- char equivCharPs = tEquiv .getOrDefault (row .c (), row .c ());
400+ int equivCharPs = tEquiv .getOrDefault (row .c (), row .c ());
392401 long codepointRankPs = tRank .getOrDefault (row .c (), 0L );
393402
394403 builder .addCharacter (
@@ -427,16 +436,16 @@ public static CollationMapper fromResultSetWithRanks(
427436 if (charsetChar == null || charsetChar .isEmpty ()) {
428437 continue ;
429438 }
439+ int c = charsetChar .codePointAt (0 );
430440 Preconditions .checkArgument (
431- charsetChar .length () == 1 ,
441+ charsetChar .length () == Character . charCount ( c ) ,
432442 "Expected single character from collation query, got: %s" ,
433443 charsetChar );
434- char c = charsetChar .charAt (0 );
435444 long rankVal = rs .getLong (CollationsOrderQueryColumns .CODEPOINT_RANK_COL );
436445 long rankPsVal = rs .getLong (CollationsOrderQueryColumns .CODEPOINT_RANK_PAD_SPACE_COL );
437446 boolean isEmpty = rs .getBoolean (CollationsOrderQueryColumns .IS_EMPTY_COL );
438447 boolean isSpace = rs .getBoolean (CollationsOrderQueryColumns .IS_SPACE_COL );
439- rows .add (new RankRow (( int ) c , c , rankVal , rankPsVal , isEmpty , isSpace ));
448+ rows .add (new RankRow (c , c , rankVal , rankPsVal , isEmpty , isSpace ));
440449 }
441450
442451 // Phase 2: compute equivalent characters.
@@ -457,12 +466,10 @@ public static CollationMapper fromResultSetWithRanks(
457466 // Phase 3: build CollationOrderRow objects and feed the mapper builder.
458467 Builder builder = builder (collationReference );
459468 for (RankRow row : rows ) {
460- char equivChar = row .isEmpty () ? row .c () : ( char ) ( int ) rankToMinCodepoint .get (row .rank ());
469+ int equivChar = row .isEmpty () ? row .c () : rankToMinCodepoint .get (row .rank ());
461470 // Space chars are not added to the trailing PAD-SPACE index; use self as placeholder.
462- char equivCharPs =
463- (row .isEmpty () || row .isSpace ())
464- ? row .c ()
465- : (char ) (int ) rankPsToMinCodepoint .get (row .rankPs ());
471+ int equivCharPs =
472+ (row .isEmpty () || row .isSpace ()) ? row .c () : rankPsToMinCodepoint .get (row .rankPs ());
466473
467474 builder .addCharacter (
468475 CollationOrderRow .builder ()
@@ -484,35 +491,35 @@ private long getCharsetSize(boolean lastCharacter) {
484491 : this .allPositionsIndex ().getCharsetSize ();
485492 }
486493
487- private long getOrdinalPosition (Character c , boolean lastCharacter ) {
494+ private long getOrdinalPosition (Integer c , boolean lastCharacter ) {
488495 return (lastCharacter && collationReference ().padSpace ())
489496 ? this .trailingPositionsPadSpace ().getOrdinalPosition (c )
490497 : this .allPositionsIndex ().getOrdinalPosition (c );
491498 }
492499
493- private Character getCharacterFromPosition (long ordinalPosition , boolean firstIteration ) {
500+ private Integer getCharacterFromPosition (long ordinalPosition , boolean firstIteration ) {
494501 return (firstIteration && collationReference ().padSpace ())
495502 ? this .trailingPositionsPadSpace ().getCharacterFromPosition (ordinalPosition )
496503 : this .allPositionsIndex ().getCharacterFromPosition (ordinalPosition );
497504 }
498505
499506 /** Internal data holder used by {@link #fromResultSetWithWeights}. */
500507 private static final class WeightRow {
501- private final char c ;
508+ private final int c ;
502509 private final String weightNt ;
503510 private final String weightT ;
504511 private final boolean isEmpty ;
505512 private final boolean isSpace ;
506513
/**
 * Captures one character row for weight-based rank computation.
 *
 * @param c code point of the character.
 * @param weightNt weight key used to group characters for the all-positions index.
 * @param weightT weight key used to group characters for the trailing PAD SPACE index.
 * @param isEmpty whether the character is flagged empty; such rows are left out of both groupings.
 * @param isSpace whether the character is flagged as a space; such rows are left out of the
 *     trailing grouping.
 */
WeightRow(int c, String weightNt, String weightT, boolean isEmpty, boolean isSpace) {
  // Flags first, then weights, then the code point; the assignments are independent.
  this.isEmpty = isEmpty;
  this.isSpace = isSpace;
  this.weightNt = weightNt;
  this.weightT = weightT;
  this.c = c;
}
514521
515- char c () {
522+ int c () {
516523 return c ;
517524 }
518525
@@ -536,13 +543,13 @@ boolean isSpace() {
536543 /** Internal data holder used by {@link #fromResultSetWithRanks}. */
537544 private static final class RankRow {
538545 private final int codepoint ;
539- private final char c ;
546+ private final int c ;
540547 private final long rank ;
541548 private final long rankPs ;
542549 private final boolean isEmpty ;
543550 private final boolean isSpace ;
544551
545- RankRow (int codepoint , char c , long rank , long rankPs , boolean isEmpty , boolean isSpace ) {
552+ RankRow (int codepoint , int c , long rank , long rankPs , boolean isEmpty , boolean isSpace ) {
546553 this .codepoint = codepoint ;
547554 this .c = c ;
548555 this .rank = rank ;
@@ -555,7 +562,7 @@ int codepoint() {
555562 return codepoint ;
556563 }
557564
558- char c () {
565+ int c () {
559566 return c ;
560567 }
561568
@@ -587,9 +594,9 @@ public abstract static class Builder {
587594
588595 abstract CollationIndex .Builder trailingPositionsPadSpaceBuilder ();
589596
590- abstract ImmutableSet .Builder <Character > emptyCharactersBuilder ();
597+ abstract ImmutableSet .Builder <Integer > emptyCharactersBuilder ();
591598
592- abstract ImmutableSet .Builder <Character > spaceCharactersBuilder ();
599+ abstract ImmutableSet .Builder <Integer > spaceCharactersBuilder ();
593600
594601 public Builder addCharacter (CollationOrderRow collationOrderRow ) {
595602
0 commit comments