Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public abstract class JDBCBaseIT extends TemplateTestBase {
private static final String MYSQL_VERSION = "8.0.30";
private static final String POSTGRES_VERSION = "42.6.1";
private static final String ORACLE_VERSION = "23.9.0.25.07";
private static final String MSSQL_VERSION = "13.2.1.jre11";
private static final String MSSQL_VERSION = "13.4.0.jre11";

@Before
public void setUpJDBC() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import com.google.cloud.teleport.v2.source.reader.io.jdbc.JdbcSchemaReference;
import com.google.cloud.teleport.v2.source.reader.io.jdbc.dialectadapter.DialectAdapter;
import com.google.cloud.teleport.v2.source.reader.io.jdbc.rowmapper.JdbcSourceRowMapper;
import com.google.cloud.teleport.v2.source.reader.io.jdbc.uniformsplitter.stringmapper.CollationOrderRow.CollationsOrderQueryColumns;
import com.google.cloud.teleport.v2.source.reader.io.jdbc.uniformsplitter.UniformSplitterDBAdapter.CollationQueryResultType;
import com.google.cloud.teleport.v2.source.reader.io.jdbc.uniformsplitter.stringmapper.CollationReference;
import com.google.cloud.teleport.v2.source.reader.io.schema.SourceColumnIndexInfo;
import com.google.cloud.teleport.v2.source.reader.io.schema.SourceColumnIndexInfo.IndexType;
Expand Down Expand Up @@ -467,10 +467,9 @@ protected String getPadSpaceString(ResultSet resultSet) throws SQLException {
return resultSet.getString(InformationSchemaStatsCols.PAD_SPACE_COL);
}
}
// For MySql5.7 there is no pad-space column in the INFORMATION_SCHEMA.COLLATIONS table.
// In these older versions, non-binary string comparisons (like VARCHAR) always follow
// PAD SPACE rules (where trailing spaces are ignored). We default to this behavior
// to ensure correct partitioning across both MySQL 5.7 and 8.x.
// MySQL 5.7 does not have a PAD_ATTRIBUTE column in INFORMATION_SCHEMA.COLLATIONS.
// In 5.7 all non-binary string comparisons follow PAD SPACE rules (trailing spaces ignored).
// We default to that behaviour so that partitioning is correct on both 5.7 and 8.x.
logger.info(
"Did not find {} column in INFORMATION_SCHEMA.COLLATIONS table. Assuming PAD-SPACE collation for non-binary strings as per MySQL5.7 spec",
InformationSchemaStatsCols.PAD_SPACE_COL);
Expand Down Expand Up @@ -511,8 +510,6 @@ private SourceColumnIndexInfo resultSetToSourceColumnIndexInfo(ResultSet rs) thr
padSpace,
numericScale,
datetimePrecision);
// TODO(vardhanvthigle): MySql 5.7 is always PAD space and does not have PAD_ATTRIBUTE
// Column.
String columType = normalizeColumnType(rs.getString(InformationSchemaStatsCols.TYPE_COL));
IndexType indexType = INDEX_TYPE_MAPPING.getOrDefault(columType, IndexType.OTHER);
CollationReference collationReference = null;
Expand Down Expand Up @@ -727,32 +724,55 @@ public boolean checkForTimeout(SQLException exception) {
}

/**
* Get Query that returns order of collation. The query must return all the characters in the
* character set with the columns listed in {@link CollationsOrderQueryColumns}.
* Get Query that returns order of collation. The query returns one row per valid character with
* columns {@code charset_char}, {@code weight_non_trailing}, {@code weight_trailing}, {@code
* is_empty}, and {@code is_space}. All grouping, ranking and equivalent-character resolution is
* performed in Java by {@link
* com.google.cloud.teleport.v2.source.reader.io.jdbc.uniformsplitter.stringmapper.CollationMapper}.
*
* <p>The query uses {@code WEIGHT_STRING()} (available since MySQL 5.6) and plain {@code CROSS
* JOIN} hex-nibble tables, making it fully compatible with MySQL 5.7+. No window functions, no
* {@code SET} variables, and no {@code PREPARE}/{@code EXECUTE} are required.
*
* @param dbCharset character set used by the database for which collation ordering has to be
* found.
* @param dbCollation collation set used by the database for which collation ordering has to be
* found.
* @param padSpace pad space used by the database for which collation ordering has to be found.
* @param dbCollation collation used by the database for which collation ordering has to be found.
* @param padSpace pad space attribute of the collation.
*/
@Override
public String getCollationsOrderQuery(String dbCharset, String dbCollation, boolean padSpace) {
String query = resourceAsString(COLLATIONS_QUERY_RESOURCE_PATH);
Map<String, String> tags = new HashMap<>();
tags.put("'" + CHARSET_REPLACEMENT_TAG + "'", "'" + dbCharset + "'");
tags.put("'" + COLLATION_REPLACEMENT_TAG + "'", "'" + dbCollation + "'");
// Queries with size > max_allowed_packet get rejected by
// the db. max_allowed_packet is generally around 16Mb which is a lot for our use case.
// The SQL template uses bare tags (no surrounding quotes) because the charset and collation
// names appear in USING and COLLATE clauses which do not accept quoted identifiers.
tags.put(CHARSET_REPLACEMENT_TAG, dbCharset);
tags.put(COLLATION_REPLACEMENT_TAG, dbCollation);
return replaceTagsAndSanitize(query, tags);
}

/**
* Reports the shape of the result set produced by {@link #getCollationsOrderQuery} for this
* adapter.
*
* @return {@link CollationQueryResultType#WEIGHT_BYTES}, always.
*/
@Override
public CollationQueryResultType collationQueryResultType() {
return CollationQueryResultType.WEIGHT_BYTES;
}

/**
* Version of MySql. As of now the code does not need to distinguish between versions of Mysql.
* Having the type allows the implementation do finer distinctions if needed in the future.
* Version of MySql.
*
* <p>The collation order query (used for string range splitting) works on both {@link #DEFAULT}
* (MySQL 8.0+) and {@link #MYSQL_5_7} using the same SQL file. The file avoids window functions
* (FIRST_VALUE / DENSE_RANK), which are only available in MySQL 8.0+: it returns per-character
* {@code WEIGHT_STRING()} sort-key bytes generated with plain {@code CROSS JOIN}s, and all
* grouping, ranking and equivalent-character resolution is performed in Java rather than in
* SQL.
*
* <p>The count query uses the {@code MAX_EXECUTION_TIME} optimizer hint, which is supported from
* MySQL 5.7.8+. For earlier 5.7 patch releases the hint is silently ignored by MySQL.
*/
public enum MySqlVersion {
/** MySQL 8.0 and later (default). */
DEFAULT,
/** MySQL 5.7.x. */
MYSQL_5_7,
}

protected static final class InformationSchemaCols {
Expand Down Expand Up @@ -783,7 +803,7 @@ protected static final class InformationSchemaStatsCols {
public static final String COLLATION_COL = "cols.COLLATION_NAME";
public static final String DATETIME_PRECISION_COL = "cols.DATETIME_PRECISION";

// TODO(vardhanvthigle): MySql 5.7 is always PAD space and does not have PAD_ATTRIBUTE Column.
// MySQL 5.7 does not expose PAD_ATTRIBUTE; the adapter defaults to PAD SPACE in that case.
public static final String PAD_SPACE_COL = "collations.PAD_ATTRIBUTE";

public static final String NUMERIC_SCALE_COL = "cols.NUMERIC_SCALE";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,32 @@ String getCountQuery(
*/
boolean checkForTimeout(SQLException exception);

/**
* Describes the shape of the result set returned by {@link #getCollationsOrderQuery}.
*
* <ul>
* <li>{@link #WEIGHT_BYTES} – the query returns raw {@code WEIGHT_STRING} sort-key bytes for
* each character (columns {@code weight_non_trailing}, {@code weight_trailing}, {@code
* is_empty}, {@code is_space}). Java performs all grouping, ranking and
* equivalent-character resolution. Used by the MySQL adapter.
* <li>{@link #WITH_RANKS} – the query returns pre-computed dense ranks ({@code codepoint_rank},
* {@code codepoint_rank_pad_space}) together with {@code is_empty} and {@code is_space}.
* Java resolves equivalent characters from the rank groups. Used by the PostgreSQL adapter.
* </ul>
*/
enum CollationQueryResultType {
/** Raw {@code WEIGHT_STRING} sort-key bytes per character; grouping and ranking happen in Java. */
WEIGHT_BYTES,
/** Pre-computed dense ranks per character; Java resolves equivalent characters from them. */
WITH_RANKS
}

/**
* Returns the type of result produced by {@link #getCollationsOrderQuery}.
*
* <p>This default implementation returns {@link CollationQueryResultType#WITH_RANKS}. An adapter
* whose query returns a different result shape is expected to override it.
*
* @return the result-set shape of this adapter's collation order query.
*/
default CollationQueryResultType collationQueryResultType() {
return CollationQueryResultType.WITH_RANKS;
}

/**
* Get a query that returns order of collation. The query must return all the characters in the
* character set with the columns listed in {@link CollationsOrderQueryColumns}.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,18 @@ public abstract class CollationIndex implements Serializable {
public abstract CollationIndexType indexType();

/**
* Map of character to it's index position based on collation order. Helps us map a string to big
* integer.
* Map of character (codepoint) to its index position based on collation order. Helps us map a
* string to a big integer.
*/
public abstract ImmutableMap<Character, Long> characterToIndex();
public abstract ImmutableMap<Integer, Long> characterToIndex();

/**
* Map if Index back to character based on collation order. Helps us unmap a big integer to
* string. Note this maps the index back to a minimum set of characters. For example in
* Map of index back to character (codepoint) based on collation order. Helps us unmap a big
* integer to a string. Note this maps the index back to a minimum set of characters. For example in
* case-insensitive collations, 'a' and 'A' will have the same index in {@link
* #characterToIndex()} and {@link #indexToCharacter()} will map the index to 'A'.
*/
public abstract ImmutableMap<Long, Character> indexToCharacter();
public abstract ImmutableMap<Long, Integer> indexToCharacter();

public static CollationIndex.Builder builder() {
return new AutoValue_CollationIndex.Builder();
Expand All @@ -61,11 +61,11 @@ public long getCharsetSize() {
return indexToCharacter().size();
}

public long getOrdinalPosition(Character c) {
public long getOrdinalPosition(Integer c) {
return characterToIndex().get(c);
}

public Character getCharacterFromPosition(Long position) {
public Integer getCharacterFromPosition(Long position) {
return indexToCharacter().get(position);
}

Expand All @@ -80,15 +80,15 @@ public abstract static class Builder {

abstract CollationIndexType indexType();

private Map<Character, Long> charToIndexCache = new HashMap<>();
private Map<Long, Character> indexToCharacterCache = new HashMap<>();
private Map<Character, Long> indexToCharacterReverseCache = new HashMap<>();
private Map<Integer, Long> charToIndexCache = new HashMap<>();
private Map<Long, Integer> indexToCharacterCache = new HashMap<>();
private Map<Integer, Long> indexToCharacterReverseCache = new HashMap<>();

abstract Builder setIndexToCharacter(ImmutableMap<Long, Character> value);
abstract Builder setIndexToCharacter(ImmutableMap<Long, Integer> value);

abstract Builder setCharacterToIndex(ImmutableMap<Character, Long> value);
abstract Builder setCharacterToIndex(ImmutableMap<Integer, Long> value);

public Builder addCharacter(Character charsetChar, Character equivalentChar, Long index) {
public Builder addCharacter(Integer charsetChar, Integer equivalentChar, Long index) {
logger.debug(
"Registering character order for {}, index-type = {}, character = {}, equivalentCharacter = {}, index = {}, isBlank = {}",
collationReference(),
Expand Down Expand Up @@ -185,7 +185,9 @@ public CollationIndex build() {
+ "index-type = "
+ indexType());
}
if (charToIndexCache.get(indexToCharacterCache.get(indexes.get(i))) != indexes.get(i)) {
if (!charToIndexCache
.get(indexToCharacterCache.get(indexes.get(i)))
.equals(indexes.get(i))) {
throw new IllegalStateException(
"index not mapping onto itself found at position "
+ i
Expand Down
Loading
Loading