Skip to content

Commit dd4f125

Browse files
committed
4byte
1 parent ceb6a27 commit dd4f125

File tree

8 files changed

+181
-144
lines changed

8 files changed

+181
-144
lines changed

v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/jdbc/uniformsplitter/stringmapper/CollationIndex.java

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -40,18 +40,18 @@ public abstract class CollationIndex implements Serializable {
4040
public abstract CollationIndexType indexType();
4141

4242
/**
43-
* Map of character to its index position based on collation order. Helps us map a string to big
44-
* integer.
43+
* Map of character (codepoint) to its index position based on collation order. Helps us map a
44+
* string to big integer.
4545
*/
46-
public abstract ImmutableMap<Character, Long> characterToIndex();
46+
public abstract ImmutableMap<Integer, Long> characterToIndex();
4747

4848
/**
49-
* Map of index back to character based on collation order. Helps us unmap a big integer to
50-
* string. Note this maps the index back to a minimum set of characters. For example in
49+
* Map of index back to character (codepoint) based on collation order. Helps us unmap a big
50+
* integer to string. Note this maps the index back to a minimum set of characters. For example in
5151
* case-insensitive collations, 'a' and 'A' will have the same index in {@link
5252
* #characterToIndex()} and {@link #indexToCharacter()} will map the index to 'A'.
5353
*/
54-
public abstract ImmutableMap<Long, Character> indexToCharacter();
54+
public abstract ImmutableMap<Long, Integer> indexToCharacter();
5555

5656
public static CollationIndex.Builder builder() {
5757
return new AutoValue_CollationIndex.Builder();
@@ -61,11 +61,11 @@ public long getCharsetSize() {
6161
return indexToCharacter().size();
6262
}
6363

64-
public long getOrdinalPosition(Character c) {
64+
public long getOrdinalPosition(Integer c) {
6565
return characterToIndex().get(c);
6666
}
6767

68-
public Character getCharacterFromPosition(Long position) {
68+
public Integer getCharacterFromPosition(Long position) {
6969
return indexToCharacter().get(position);
7070
}
7171

@@ -80,15 +80,15 @@ public abstract static class Builder {
8080

8181
abstract CollationIndexType indexType();
8282

83-
private Map<Character, Long> charToIndexCache = new HashMap<>();
84-
private Map<Long, Character> indexToCharacterCache = new HashMap<>();
85-
private Map<Character, Long> indexToCharacterReverseCache = new HashMap<>();
83+
private Map<Integer, Long> charToIndexCache = new HashMap<>();
84+
private Map<Long, Integer> indexToCharacterCache = new HashMap<>();
85+
private Map<Integer, Long> indexToCharacterReverseCache = new HashMap<>();
8686

87-
abstract Builder setIndexToCharacter(ImmutableMap<Long, Character> value);
87+
abstract Builder setIndexToCharacter(ImmutableMap<Long, Integer> value);
8888

89-
abstract Builder setCharacterToIndex(ImmutableMap<Character, Long> value);
89+
abstract Builder setCharacterToIndex(ImmutableMap<Integer, Long> value);
9090

91-
public Builder addCharacter(Character charsetChar, Character equivalentChar, Long index) {
91+
public Builder addCharacter(Integer charsetChar, Integer equivalentChar, Long index) {
9292
logger.debug(
9393
"Registering character order for {}, index-type = {}, character = {}, equivalentCharacter = {}, index = {}, isBlank = {}",
9494
collationReference(),

v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/jdbc/uniformsplitter/stringmapper/CollationMapper.java

Lines changed: 56 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ public abstract class CollationMapper implements Serializable {
9191
* using utf8mb4), 'b') = 'ab' COLLATE <collation>;} returns 1. TODO(vardhanvthigle): Check this
9292
* behavior for PG and other databases.
9393
*/
94-
public abstract ImmutableSet<Character> emptyCharacters();
94+
public abstract ImmutableSet<Integer> emptyCharacters();
9595

9696
/**
9797
* Space Characters. MySQL ignores trailing space characters in comparisons for PAD space
@@ -100,11 +100,13 @@ public abstract class CollationMapper implements Serializable {
100100
* (UNHEX(C2A0)) when the collation is Pad Space. These have the same behavior as ASCII space as far
101101
* as trailing or non-trailing comparison is concerned.
102102
*/
103-
public abstract ImmutableSet<Character> spaceCharacters();
103+
public abstract ImmutableSet<Integer> spaceCharacters();
104104

105105
@Memoized
106106
String allSpaceCharacters() {
107-
return this.spaceCharacters().stream().map(String::valueOf).collect(Collectors.joining(""));
107+
return this.spaceCharacters().stream()
108+
.map(c -> new String(Character.toChars(c)))
109+
.collect(Collectors.joining(""));
108110
}
109111

110112
@Memoized
@@ -114,7 +116,9 @@ String emptyReplacePattern() {
114116
}
115117
return "["
116118
+ Pattern.quote(
117-
this.emptyCharacters().stream().map(String::valueOf).collect(Collectors.joining("")))
119+
this.emptyCharacters().stream()
120+
.map(c -> new String(Character.toChars(c)))
121+
.collect(Collectors.joining("")))
118122
+ "]";
119123
}
120124

@@ -160,11 +164,16 @@ public BigInteger mapString(@Nullable String element, int lengthToPad) {
160164
}
161165

162166
// Convert the string to BigInteger.
163-
for (int index = 0; index < element.length(); index++) {
164-
Character c = element.charAt(index);
167+
int length = element.codePointCount(0, element.length());
168+
int codePointIndex = 0;
169+
for (int index = 0; index < element.length(); ) {
170+
int codepoint = element.codePointAt(index);
171+
boolean isLast = (codePointIndex == length - 1);
165172
ret =
166-
ret.multiply(BigInteger.valueOf(getCharsetSize(index == (element.length() - 1))))
167-
.add(BigInteger.valueOf(getOrdinalPosition(c, index == (element.length() - 1))));
173+
ret.multiply(BigInteger.valueOf(getCharsetSize(isLast)))
174+
.add(BigInteger.valueOf(getOrdinalPosition(codepoint, isLast)));
175+
index += Character.charCount(codepoint);
176+
codePointIndex++;
168177
}
169178
for (int index = element.length(); index < lengthToPad; index++) {
170179
ret = ret.multiply(BigInteger.valueOf(getCharsetSize(index == (element.length() - 1))));
@@ -199,16 +208,16 @@ public String unMapString(BigInteger element) {
199208

200209
// Base Case that the string just represents single character
201210
if (element.equals(BigInteger.ZERO)) {
202-
char c = getCharacterFromPosition(element.longValue(), true);
203-
return String.valueOf(c);
211+
int c = getCharacterFromPosition(element.longValue(), true);
212+
return new String(Character.toChars(c));
204213
}
205214

206215
while (!element.equals(BigInteger.ZERO)) {
207216
long charsetSize = getCharsetSize(index == 0);
208217

209218
BigInteger reminder = element.mod(BigInteger.valueOf(charsetSize));
210-
char c = getCharacterFromPosition(reminder.longValue(), (index == 0));
211-
word.append(c);
219+
int c = getCharacterFromPosition(reminder.longValue(), (index == 0));
220+
word.appendCodePoint(c);
212221

213222
element = element.divide(BigInteger.valueOf(charsetSize));
214223
index++;
@@ -318,11 +327,11 @@ private static CollationMapper fromResultSetWithWeights(
318327
if (charsetChar == null || charsetChar.isEmpty()) {
319328
continue;
320329
}
330+
int c = charsetChar.codePointAt(0);
321331
Preconditions.checkArgument(
322-
charsetChar.length() == 1,
332+
charsetChar.length() == Character.charCount(c),
323333
"Expected single character from collation query, got: %s",
324334
charsetChar);
325-
char c = charsetChar.charAt(0);
326335
byte[] wNt = rs.getBytes(CollationsOrderQueryColumns.WEIGHT_NON_TRAILING_COL);
327336
byte[] wT = rs.getBytes(CollationsOrderQueryColumns.WEIGHT_TRAILING_COL);
328337
boolean isEmpty = rs.getBoolean(CollationsOrderQueryColumns.IS_EMPTY_COL);
@@ -332,7 +341,7 @@ private static CollationMapper fromResultSetWithWeights(
332341
if (wNt == null && !isEmpty) {
333342
logger.warn(
334343
"Skipping character codepoint={} for {} because weight_non_trailing is NULL",
335-
(int) c,
344+
c,
336345
collationReference);
337346
continue;
338347
}
@@ -343,19 +352,19 @@ private static CollationMapper fromResultSetWithWeights(
343352

344353
// Phase 2a: build all-positions equivalence groups from weight_non_trailing.
345354
// TreeMap gives groups in sorted weight order (= collation rank order).
346-
TreeMap<String, TreeMap<Integer, Character>> ntGroups = new TreeMap<>(weightKeyOrder);
355+
TreeMap<String, TreeMap<Integer, Integer>> ntGroups = new TreeMap<>(weightKeyOrder);
347356
for (WeightRow row : rows) {
348357
if (!row.isEmpty()) {
349-
ntGroups.computeIfAbsent(row.weightNt(), k -> new TreeMap<>()).put((int) row.c(), row.c());
358+
ntGroups.computeIfAbsent(row.weightNt(), k -> new TreeMap<>()).put(row.c(), row.c());
350359
}
351360
}
352361
// Assign dense ranks and resolve equivalent characters (min codepoint per group).
353-
Map<Character, Long> ntRank = new HashMap<>();
354-
Map<Character, Character> ntEquiv = new HashMap<>();
362+
Map<Integer, Long> ntRank = new HashMap<>();
363+
Map<Integer, Integer> ntEquiv = new HashMap<>();
355364
long rank = 0;
356-
for (TreeMap<Integer, Character> group : ntGroups.values()) {
357-
char equiv = group.firstEntry().getValue(); // smallest codepoint = canonical equivalent
358-
for (Character c : group.values()) {
365+
for (TreeMap<Integer, Integer> group : ntGroups.values()) {
366+
int equiv = group.firstEntry().getValue(); // smallest codepoint = canonical equivalent
367+
for (Integer c : group.values()) {
359368
ntRank.put(c, rank);
360369
ntEquiv.put(c, equiv);
361370
}
@@ -365,18 +374,18 @@ private static CollationMapper fromResultSetWithWeights(
365374
// Phase 2b: build PAD-SPACE trailing equivalence groups from weight_trailing.
366375
// Space characters (is_space=true) are intentionally excluded; they are tracked in
367376
// spaceCharacters and stripped before the trailing index is consulted.
368-
TreeMap<String, TreeMap<Integer, Character>> tGroups = new TreeMap<>(weightKeyOrder);
377+
TreeMap<String, TreeMap<Integer, Integer>> tGroups = new TreeMap<>(weightKeyOrder);
369378
for (WeightRow row : rows) {
370379
if (!row.isEmpty() && !row.isSpace()) {
371-
tGroups.computeIfAbsent(row.weightT(), k -> new TreeMap<>()).put((int) row.c(), row.c());
380+
tGroups.computeIfAbsent(row.weightT(), k -> new TreeMap<>()).put(row.c(), row.c());
372381
}
373382
}
374-
Map<Character, Long> tRank = new HashMap<>();
375-
Map<Character, Character> tEquiv = new HashMap<>();
383+
Map<Integer, Long> tRank = new HashMap<>();
384+
Map<Integer, Integer> tEquiv = new HashMap<>();
376385
long tRankCounter = 0;
377-
for (TreeMap<Integer, Character> group : tGroups.values()) {
378-
char equiv = group.firstEntry().getValue();
379-
for (Character c : group.values()) {
386+
for (TreeMap<Integer, Integer> group : tGroups.values()) {
387+
int equiv = group.firstEntry().getValue();
388+
for (Integer c : group.values()) {
380389
tRank.put(c, tRankCounter);
381390
tEquiv.put(c, equiv);
382391
}
@@ -386,9 +395,9 @@ private static CollationMapper fromResultSetWithWeights(
386395
// Phase 3: build CollationOrderRow objects and feed the mapper builder.
387396
Builder builder = builder(collationReference);
388397
for (WeightRow row : rows) {
389-
char equivChar = ntEquiv.getOrDefault(row.c(), row.c());
398+
int equivChar = ntEquiv.getOrDefault(row.c(), row.c());
390399
long codepointRank = ntRank.getOrDefault(row.c(), 0L);
391-
char equivCharPs = tEquiv.getOrDefault(row.c(), row.c());
400+
int equivCharPs = tEquiv.getOrDefault(row.c(), row.c());
392401
long codepointRankPs = tRank.getOrDefault(row.c(), 0L);
393402

394403
builder.addCharacter(
@@ -427,16 +436,16 @@ public static CollationMapper fromResultSetWithRanks(
427436
if (charsetChar == null || charsetChar.isEmpty()) {
428437
continue;
429438
}
439+
int c = charsetChar.codePointAt(0);
430440
Preconditions.checkArgument(
431-
charsetChar.length() == 1,
441+
charsetChar.length() == Character.charCount(c),
432442
"Expected single character from collation query, got: %s",
433443
charsetChar);
434-
char c = charsetChar.charAt(0);
435444
long rankVal = rs.getLong(CollationsOrderQueryColumns.CODEPOINT_RANK_COL);
436445
long rankPsVal = rs.getLong(CollationsOrderQueryColumns.CODEPOINT_RANK_PAD_SPACE_COL);
437446
boolean isEmpty = rs.getBoolean(CollationsOrderQueryColumns.IS_EMPTY_COL);
438447
boolean isSpace = rs.getBoolean(CollationsOrderQueryColumns.IS_SPACE_COL);
439-
rows.add(new RankRow((int) c, c, rankVal, rankPsVal, isEmpty, isSpace));
448+
rows.add(new RankRow(c, c, rankVal, rankPsVal, isEmpty, isSpace));
440449
}
441450

442451
// Phase 2: compute equivalent characters.
@@ -457,12 +466,10 @@ public static CollationMapper fromResultSetWithRanks(
457466
// Phase 3: build CollationOrderRow objects and feed the mapper builder.
458467
Builder builder = builder(collationReference);
459468
for (RankRow row : rows) {
460-
char equivChar = row.isEmpty() ? row.c() : (char) (int) rankToMinCodepoint.get(row.rank());
469+
int equivChar = row.isEmpty() ? row.c() : rankToMinCodepoint.get(row.rank());
461470
// Space chars are not added to the trailing PAD-SPACE index; use self as placeholder.
462-
char equivCharPs =
463-
(row.isEmpty() || row.isSpace())
464-
? row.c()
465-
: (char) (int) rankPsToMinCodepoint.get(row.rankPs());
471+
int equivCharPs =
472+
(row.isEmpty() || row.isSpace()) ? row.c() : rankPsToMinCodepoint.get(row.rankPs());
466473

467474
builder.addCharacter(
468475
CollationOrderRow.builder()
@@ -484,35 +491,35 @@ private long getCharsetSize(boolean lastCharacter) {
484491
: this.allPositionsIndex().getCharsetSize();
485492
}
486493

487-
private long getOrdinalPosition(Character c, boolean lastCharacter) {
494+
private long getOrdinalPosition(Integer c, boolean lastCharacter) {
488495
return (lastCharacter && collationReference().padSpace())
489496
? this.trailingPositionsPadSpace().getOrdinalPosition(c)
490497
: this.allPositionsIndex().getOrdinalPosition(c);
491498
}
492499

493-
private Character getCharacterFromPosition(long ordinalPosition, boolean firstIteration) {
500+
private Integer getCharacterFromPosition(long ordinalPosition, boolean firstIteration) {
494501
return (firstIteration && collationReference().padSpace())
495502
? this.trailingPositionsPadSpace().getCharacterFromPosition(ordinalPosition)
496503
: this.allPositionsIndex().getCharacterFromPosition(ordinalPosition);
497504
}
498505

499506
/** Internal data holder used by {@link #fromResultSetWithWeights}. */
500507
private static final class WeightRow {
501-
private final char c;
508+
private final int c;
502509
private final String weightNt;
503510
private final String weightT;
504511
private final boolean isEmpty;
505512
private final boolean isSpace;
506513

507-
WeightRow(char c, String weightNt, String weightT, boolean isEmpty, boolean isSpace) {
514+
WeightRow(int c, String weightNt, String weightT, boolean isEmpty, boolean isSpace) {
508515
this.c = c;
509516
this.weightNt = weightNt;
510517
this.weightT = weightT;
511518
this.isEmpty = isEmpty;
512519
this.isSpace = isSpace;
513520
}
514521

515-
char c() {
522+
int c() {
516523
return c;
517524
}
518525

@@ -536,13 +543,13 @@ boolean isSpace() {
536543
/** Internal data holder used by {@link #fromResultSetWithRanks}. */
537544
private static final class RankRow {
538545
private final int codepoint;
539-
private final char c;
546+
private final int c;
540547
private final long rank;
541548
private final long rankPs;
542549
private final boolean isEmpty;
543550
private final boolean isSpace;
544551

545-
RankRow(int codepoint, char c, long rank, long rankPs, boolean isEmpty, boolean isSpace) {
552+
RankRow(int codepoint, int c, long rank, long rankPs, boolean isEmpty, boolean isSpace) {
546553
this.codepoint = codepoint;
547554
this.c = c;
548555
this.rank = rank;
@@ -555,7 +562,7 @@ int codepoint() {
555562
return codepoint;
556563
}
557564

558-
char c() {
565+
int c() {
559566
return c;
560567
}
561568

@@ -587,9 +594,9 @@ public abstract static class Builder {
587594

588595
abstract CollationIndex.Builder trailingPositionsPadSpaceBuilder();
589596

590-
abstract ImmutableSet.Builder<Character> emptyCharactersBuilder();
597+
abstract ImmutableSet.Builder<Integer> emptyCharactersBuilder();
591598

592-
abstract ImmutableSet.Builder<Character> spaceCharactersBuilder();
599+
abstract ImmutableSet.Builder<Integer> spaceCharactersBuilder();
593600

594601
public Builder addCharacter(CollationOrderRow collationOrderRow) {
595602

0 commit comments

Comments
 (0)