GoogleCloudPlatform · sm745052 · Apr 15, 2026 · Apr 15, 2026 · Apr 15, 2026 · Apr 18, 2026
diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/JDBCBaseIT.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/JDBCBaseIT.java
@@ -58,7 +58,7 @@ public abstract class JDBCBaseIT extends TemplateTestBase {
   private static final String MYSQL_VERSION = "8.0.30";
   private static final String POSTGRES_VERSION = "42.6.1";
   private static final String ORACLE_VERSION = "23.9.0.25.07";
-  private static final String MSSQL_VERSION = "13.2.1.jre11";
+  private static final String MSSQL_VERSION = "13.4.0.jre11";
 
   @Before
   public void setUpJDBC() throws IOException {

@@ -27,7 +27,7 @@
 import com.google.cloud.teleport.v2.source.reader.io.jdbc.JdbcSchemaReference;
 import com.google.cloud.teleport.v2.source.reader.io.jdbc.dialectadapter.DialectAdapter;
 import com.google.cloud.teleport.v2.source.reader.io.jdbc.rowmapper.JdbcSourceRowMapper;
-import com.google.cloud.teleport.v2.source.reader.io.jdbc.uniformsplitter.stringmapper.CollationOrderRow.CollationsOrderQueryColumns;
+import com.google.cloud.teleport.v2.source.reader.io.jdbc.uniformsplitter.UniformSplitterDBAdapter.CollationQueryResultType;
 import com.google.cloud.teleport.v2.source.reader.io.jdbc.uniformsplitter.stringmapper.CollationReference;
 import com.google.cloud.teleport.v2.source.reader.io.schema.SourceColumnIndexInfo;
 import com.google.cloud.teleport.v2.source.reader.io.schema.SourceColumnIndexInfo.IndexType;
@@ -467,10 +467,9 @@ protected String getPadSpaceString(ResultSet resultSet) throws SQLException {
         return resultSet.getString(InformationSchemaStatsCols.PAD_SPACE_COL);
       }
     }
-    // For MySql5.7 there is no pad-space column in the INFORMATION_SCHEMA.COLLATIONS table.
-    // In these older versions, non-binary string comparisons (like VARCHAR) always follow
-    // PAD SPACE rules (where trailing spaces are ignored). We default to this behavior
-    // to ensure correct partitioning across both MySQL 5.7 and 8.x.
+    // MySQL 5.7 does not have a PAD_ATTRIBUTE column in INFORMATION_SCHEMA.COLLATIONS.
+    // In 5.7 all non-binary string comparisons follow PAD SPACE rules (trailing spaces ignored).
+    // We default to that behaviour so that partitioning is correct on both 5.7 and 8.x.
     logger.info(
         "Did not find {} column in INFORMATION_SCHEMA.COLLATIONS table. Assuming PAD-SPACE collation for non-binary strings as per MySQL5.7 spec",
         InformationSchemaStatsCols.PAD_SPACE_COL);
@@ -511,8 +510,6 @@ private SourceColumnIndexInfo resultSetToSourceColumnIndexInfo(ResultSet rs) thr
         padSpace,
         numericScale,
         datetimePrecision);
-    // TODO(vardhanvthigle): MySql 5.7 is always PAD space and does not have PAD_ATTRIBUTE
-    // Column.
     String columType = normalizeColumnType(rs.getString(InformationSchemaStatsCols.TYPE_COL));
     IndexType indexType = INDEX_TYPE_MAPPING.getOrDefault(columType, IndexType.OTHER);
     CollationReference collationReference = null;
@@ -727,32 +724,55 @@ public boolean checkForTimeout(SQLException exception) {
   }
 
   /**
-   * Get Query that returns order of collation. The query must return all the characters in the
-   * character set with the columns listed in {@link CollationsOrderQueryColumns}.
+   * Get Query that returns order of collation. The query returns one row per valid character with
+   * columns {@code charset_char}, {@code weight_non_trailing}, {@code weight_trailing}, {@code
+   * is_empty}, and {@code is_space}. All grouping, ranking and equivalent-character resolution is
+   * performed in Java by {@link
+   * com.google.cloud.teleport.v2.source.reader.io.jdbc.uniformsplitter.stringmapper.CollationMapper}.
+   *
+   * <p>The query uses {@code WEIGHT_STRING()} (available since MySQL 5.6) and plain {@code CROSS
+   * JOIN} hex-nibble tables, making it fully compatible with MySQL 5.7+. No window functions, no
+   * {@code SET} variables, and no {@code PREPARE}/{@code EXECUTE} are required.
    *
    * @param dbCharset character set used by the database for which collation ordering has to be
    *     found.
-   * @param dbCollation collation set used by the database for which collation ordering has to be
-   *     found.
-   * @param padSpace pad space used by the database for which collation ordering has to be found.
+   * @param dbCollation collation used by the database for which collation ordering has to be found.
+   * @param padSpace pad space attribute of the collation.
    */
   @Override
   public String getCollationsOrderQuery(String dbCharset, String dbCollation, boolean padSpace) {
     String query = resourceAsString(COLLATIONS_QUERY_RESOURCE_PATH);
     Map<String, String> tags = new HashMap<>();
-    tags.put("'" + CHARSET_REPLACEMENT_TAG + "'", "'" + dbCharset + "'");
-    tags.put("'" + COLLATION_REPLACEMENT_TAG + "'", "'" + dbCollation + "'");
-    // Queries with size > max_allowed_packet get rejected by
-    // the db. max_allowed_packet is generally around 16Mb which is a lot for our use case.
+    // The SQL template uses bare tags (no surrounding quotes) because the charset and collation
+    // names appear in USING and COLLATE clauses which do not accept quoted identifiers.
+    tags.put(CHARSET_REPLACEMENT_TAG, dbCharset);
+    tags.put(COLLATION_REPLACEMENT_TAG, dbCollation);
     return replaceTagsAndSanitize(query, tags);
   }
 
+  @Override
+  public CollationQueryResultType collationQueryResultType() {
+    return CollationQueryResultType.WEIGHT_BYTES;
+  }
+
   /**
-   * Version of MySql. As of now the code does not need to distinguish between versions of Mysql.
-   * Having the type allows the implementation do finer distinctions if needed in the future.
+   * Version of MySql.
+   *
+   * <p>The collation order query (used for string range splitting) works on both {@link #DEFAULT}
+   * (MySQL 8.0+) and {@link #MYSQL_5_7} using the same SQL file. The file avoids window functions
+   * (FIRST_VALUE / DENSE_RANK), which are only available in MySQL 8.0+; it returns raw {@code
+   * WEIGHT_STRING()} sort keys per character and leaves grouping and ranking to Java. This is
+   * also a performance improvement on 8.0 because the expensive codepoint cross-join is no longer
+   * re-evaluated as a nested subquery inside each window partition.
+   *
+   * <p>The count query uses the {@code MAX_EXECUTION_TIME} optimizer hint, which is supported from
+   * MySQL 5.7.8+. For earlier 5.7 patch releases the hint is silently ignored by MySQL.
    */
   public enum MySqlVersion {
+    /** MySQL 8.0 and later (default). */
     DEFAULT,
+    /** MySQL 5.7.x. */
+    MYSQL_5_7,
   }
 
   protected static final class InformationSchemaCols {
@@ -783,7 +803,7 @@ protected static final class InformationSchemaStatsCols {
     public static final String COLLATION_COL = "cols.COLLATION_NAME";
     public static final String DATETIME_PRECISION_COL = "cols.DATETIME_PRECISION";
 
-    // TODO(vardhanvthigle): MySql 5.7 is always PAD space and does not have PAD_ATTRIBUTE Column.
+    // MySQL 5.7 does not expose PAD_ATTRIBUTE; the adapter defaults to PAD SPACE in that case.
     public static final String PAD_SPACE_COL = "collations.PAD_ATTRIBUTE";
 
     public static final String NUMERIC_SCALE_COL = "cols.NUMERIC_SCALE";

@@ -67,6 +67,32 @@ String getCountQuery(
    */
   boolean checkForTimeout(SQLException exception);
 
+  /**
+   * Describes the shape of the result set returned by {@link #getCollationsOrderQuery}.
+   *
+   * <ul>
+   *   <li>{@link #WEIGHT_BYTES} – the query returns raw {@code WEIGHT_STRING} sort-key bytes for
+   *       each character (columns {@code weight_non_trailing}, {@code weight_trailing}, {@code
+   *       is_empty}, {@code is_space}). Java performs all grouping, ranking and
+   *       equivalent-character resolution. Used by the MySQL adapter.
+   *   <li>{@link #WITH_RANKS} – the query returns pre-computed dense ranks ({@code codepoint_rank},
+   *       {@code codepoint_rank_pad_space}) together with {@code is_empty} and {@code is_space}.
+   *       Java resolves equivalent characters from the rank groups. Used by the PostgreSQL adapter.
+   * </ul>
+   */
+  enum CollationQueryResultType {
+    WEIGHT_BYTES,
+    WITH_RANKS
+  }
+
+  /**
+   * Returns the type of result produced by {@link #getCollationsOrderQuery}. Defaults to {@link
+   * CollationQueryResultType#WITH_RANKS}.
+   */
+  default CollationQueryResultType collationQueryResultType() {
+    return CollationQueryResultType.WITH_RANKS;
+  }
+
   /**
    * Get a query that returns order of collation. The query must return all the characters in the
    * character set with the columns listed in {@link CollationsOrderQueryColumns}.

@@ -40,18 +40,18 @@ public abstract class CollationIndex implements Serializable {
   public abstract CollationIndexType indexType();
 
   /**
-   * Map of character to it's index position based on collation order. Helps us map a string to big
-   * integer.
+   * Map of character (codepoint) to its index position based on collation order. Helps us map a
+   * string to big integer.
    */
-  public abstract ImmutableMap<Character, Long> characterToIndex();
+  public abstract ImmutableMap<Integer, Long> characterToIndex();
 
   /**
-   * Map if Index back to character based on collation order. Helps us unmap a big integer to
-   * string. Note this maps the index back to a minimum set of characters. For example in
+   * Map of index back to character (codepoint) based on collation order. Helps us unmap a big
+   * integer to string. Note this maps the index back to a minimum set of characters. For example in
    * case-insensitive collations, 'a' and 'A' will have the same index in {@link
    * #characterToIndex()} and {@link #indexToCharacter()} will map the index to 'A'.
    */
-  public abstract ImmutableMap<Long, Character> indexToCharacter();
+  public abstract ImmutableMap<Long, Integer> indexToCharacter();
 
   public static CollationIndex.Builder builder() {
     return new AutoValue_CollationIndex.Builder();
@@ -61,11 +61,11 @@ public long getCharsetSize() {
     return indexToCharacter().size();
   }
 
-  public long getOrdinalPosition(Character c) {
+  public long getOrdinalPosition(Integer c) {
     return characterToIndex().get(c);
   }
 
-  public Character getCharacterFromPosition(Long position) {
+  public Integer getCharacterFromPosition(Long position) {
     return indexToCharacter().get(position);
   }
 
@@ -80,15 +80,15 @@ public abstract static class Builder {
 
     abstract CollationIndexType indexType();
 
-    private Map<Character, Long> charToIndexCache = new HashMap<>();
-    private Map<Long, Character> indexToCharacterCache = new HashMap<>();
-    private Map<Character, Long> indexToCharacterReverseCache = new HashMap<>();
+    private Map<Integer, Long> charToIndexCache = new HashMap<>();
+    private Map<Long, Integer> indexToCharacterCache = new HashMap<>();
+    private Map<Integer, Long> indexToCharacterReverseCache = new HashMap<>();
 
-    abstract Builder setIndexToCharacter(ImmutableMap<Long, Character> value);
+    abstract Builder setIndexToCharacter(ImmutableMap<Long, Integer> value);
 
-    abstract Builder setCharacterToIndex(ImmutableMap<Character, Long> value);
+    abstract Builder setCharacterToIndex(ImmutableMap<Integer, Long> value);
 
-    public Builder addCharacter(Character charsetChar, Character equivalentChar, Long index) {
+    public Builder addCharacter(Integer charsetChar, Integer equivalentChar, Long index) {
       logger.debug(
           "Registering character order for {}, index-type = {}, character = {}, equivalentCharacter = {}, index = {}, isBlank = {}",
           collationReference(),
@@ -185,7 +185,9 @@ public CollationIndex build() {
                   + "index-type = "
                   + indexType());
         }
-        if (charToIndexCache.get(indexToCharacterCache.get(indexes.get(i))) != indexes.get(i)) {
+        if (!charToIndexCache
+            .get(indexToCharacterCache.get(indexes.get(i)))
+            .equals(indexes.get(i))) {
           throw new IllegalStateException(
               "index not mapping onto itself found at position "
                   + i