Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions apps/sim/lib/fractional-indexing/fractional-indexing.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/**
* @vitest-environment node
*
* Locks in the BYTEWISE ordering the rest of the stack depends on: `order_key` is
* stored `COLLATE "C"` (migration 0228) so Postgres compares keys the same way this
* library does. If the library's order ever diverged from ASCII byte order — e.g.
* across the `Z` (0x5A) < `a` (0x61) integer-head boundary, exactly where the
* `en_US.UTF-8` locale disagrees — inserts would mint keys that fail the `a >= b`
* assertion and rows would display out of order. These tests would catch that.
*/
import { describe, expect, it } from 'vitest'
import {
BASE_62_DIGITS,
generateKeyBetween,
generateNKeysBetween,
} from '@/lib/fractional-indexing/fractional-indexing'

/** Bytewise (ASCII / UTF-16 code-unit) comparator — what `COLLATE "C"` and the library use. */
const byteCompare = (a: string, b: string): number => (a < b ? -1 : a > b ? 1 : 0)

describe('fractional-indexing bytewise ordering', () => {
it('uses an alphabet in ascending char-code order with uppercase before lowercase', () => {
for (let i = 1; i < BASE_62_DIGITS.length; i++) {
expect(BASE_62_DIGITS.charCodeAt(i)).toBeGreaterThan(BASE_62_DIGITS.charCodeAt(i - 1))
}
// The boundary en_US.UTF-8 inverts: bytewise 'Z' < 'a', locale 'a' < 'Z'.
expect('Z' < 'a').toBe(true)
expect(BASE_62_DIGITS.indexOf('Z')).toBeLessThan(BASE_62_DIGITS.indexOf('a'))
})

it('produces strictly bytewise-increasing keys on repeated append', () => {
let prev: string | null = null
let last = ''
for (let i = 0; i < 200; i++) {
const key: string = generateKeyBetween(prev, null)
if (last) expect(byteCompare(last, key)).toBe(-1)
last = key
prev = key
}
})

it('stays bytewise-ordered when prepends cross the Z/a integer-head boundary', () => {
// Repeated prepend walks the integer head down out of 'a' into 'Z','Y',… —
// the exact uppercase region en_US.UTF-8 sorts wrong. The first prepend before
// "a0" already yields a 'Z'-headed key ("Zz").
const keys: string[] = []
let next: string | null = null
for (let i = 0; i < 60; i++) {
const key: string = generateKeyBetween(null, next)
keys.push(key)
next = key
}
const ascending = [...keys].reverse() // prepends are emitted largest → smallest
expect([...ascending].sort(byteCompare)).toEqual(ascending)
expect(new Set(keys).size).toBe(keys.length) // all distinct
// An uppercase-headed key sorts before the lowercase-headed "a0" at the tail.
expect(ascending.some((k) => k[0] >= 'A' && k[0] <= 'Z')).toBe(true)
expect(ascending[ascending.length - 1][0]).toBe('a')
})

it('mints a contiguous run that is bytewise-sorted and distinct', () => {
const keys = generateNKeysBetween(null, null, 500)
expect([...keys].sort(byteCompare)).toEqual(keys)
expect(new Set(keys).size).toBe(keys.length)
})

it('throws when bounds are out of order — the contract COLLATE "C" must satisfy', () => {
expect(() => generateKeyBetween('a1', 'a0')).toThrow()
expect(() => generateKeyBetween('a0', 'a0')).toThrow()
})
})
57 changes: 31 additions & 26 deletions apps/sim/lib/table/service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import {
gte,
inArray,
isNull,
lt,
ne,
or,
type SQL,
Expand Down Expand Up @@ -1268,38 +1269,42 @@ async function resolveInsertByNeighbor(
}

if (afterRowId) {
// hi = the smallest key strictly after the anchor.
const [next] = await trx
.select({ orderKey: userTableRows.orderKey })
.from(userTableRows)
.where(
and(
eq(userTableRows.tableId, tableId),
sql`(${userTableRows.orderKey}, ${userTableRows.id}) > (${anchorKey}, ${afterRowId})`
)
)
.orderBy(asc(userTableRows.orderKey), asc(userTableRows.id))
.limit(1)
// hi = the smallest key strictly GREATER than the anchor key. Comparing keys
// (not the `(order_key, id)` row tuple) skips past any sibling that shares the
// anchor's key, so `keyBetween` always gets strictly-ordered bounds and can't
// throw on a stray duplicate. Identical to the row tuple when keys are distinct.
// A null anchorKey (flag off, un-backfilled) has no key to compare — leave the
// upper bound open, matching the prior best-effort behavior.
let nextKey: string | null = null
if (anchorKey !== null) {
const [next] = await trx
.select({ orderKey: userTableRows.orderKey })
.from(userTableRows)
.where(and(eq(userTableRows.tableId, tableId), gt(userTableRows.orderKey, anchorKey)))
.orderBy(asc(userTableRows.orderKey))
.limit(1)
nextKey = next?.orderKey ?? null
}
return {
orderKey: keyBetween(anchorKey, next?.orderKey ?? null),
orderKey: keyBetween(anchorKey, nextKey),
position: anchor.position + 1,
}
}

// beforeRowId: lo = the largest key strictly before the anchor.
const [prev] = await trx
.select({ orderKey: userTableRows.orderKey })
.from(userTableRows)
.where(
and(
eq(userTableRows.tableId, tableId),
sql`(${userTableRows.orderKey}, ${userTableRows.id}) < (${anchorKey}, ${beforeRowId})`
)
)
.orderBy(desc(userTableRows.orderKey), desc(userTableRows.id))
.limit(1)
// beforeRowId: lo = the largest key strictly LESS than the anchor key (distinct,
// same rationale as the afterRowId branch above).
let prevKey: string | null = null
if (anchorKey !== null) {
const [prev] = await trx
.select({ orderKey: userTableRows.orderKey })
.from(userTableRows)
.where(and(eq(userTableRows.tableId, tableId), lt(userTableRows.orderKey, anchorKey)))
.orderBy(desc(userTableRows.orderKey))
.limit(1)
prevKey = prev?.orderKey ?? null
}
return {
orderKey: keyBetween(prev?.orderKey ?? null, anchorKey),
orderKey: keyBetween(prevKey, anchorKey),
position: anchor.position,
}
}
Expand Down
168 changes: 168 additions & 0 deletions apps/sim/scripts/repair-table-order-key-collation.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#!/usr/bin/env bun

/**
* One-off repair for `user_table_rows.order_key` rows mis-ordered by the
* collation bug fixed in migration 0228.
*
* Fractional `order_key`s are base-62 strings the fractional-indexing library
* compares BYTEWISE (ASCII: `0-9 < A-Z < a-z`). Before migration 0228 the column
* compared under the database's `en_US.UTF-8` locale, where lowercase interleaves
* with/precedes uppercase ("a0" < "Zz", the opposite of bytewise). Keys minted in
* that window were anchored to the wrong neighbors, so a table's keys can be
* out of order — or duplicated — under bytewise comparison. That makes inserts
* throw `generateKeyBetween`'s `a >= b` assertion and rows display out of order.
*
* This script finds every table whose `order_key`s are mis-assigned: walking rows
* in their authoritative `position, id` order, a row's key is `>=` the next row's
* under bytewise (`COLLATE "C"`) comparison. That covers swapped keys (a low
* position holding a bytewise-larger key than a higher one) and duplicates. Each
* flagged table is re-keyed from `position` order — the legacy authoritative order
* the original backfill also used — minting a fresh, evenly-spaced, distinct run
* with `nKeysBetween`.
*
* Distinct from `backfill-table-order-keys.ts`, which keys tables with NULL keys;
* this one repairs tables that are fully keyed but bytewise-disordered. Run it
* AFTER migration 0228 so the re-key writes and sorts under `COLLATE "C"`.
*
* Per-table-atomic: each table is re-keyed inside one transaction holding the
* same per-table advisory lock the app uses for inserts, so a concurrent insert
* can't interleave. Idempotent: a table whose keys are already distinct and
* ordered is never selected, so a re-run after a partial failure is safe.
*
* Usage:
* DATABASE_URL=... bun run apps/sim/scripts/repair-table-order-key-collation.ts
* DATABASE_URL=... bun run apps/sim/scripts/repair-table-order-key-collation.ts --dry-run
*/

import { userTableRows } from '@sim/db/schema'
import { getErrorMessage } from '@sim/utils/errors'
import { asc, eq, sql } from 'drizzle-orm'
import { drizzle } from 'drizzle-orm/postgres-js'
import postgres from 'postgres'
import { nKeysBetween } from '@/lib/table/order-key'

/** See backfill-table-order-keys.ts — keeps each VALUES list well under the param/stack ceilings. */
const WRITE_CHUNK_SIZE = 5000

export async function runRepair(): Promise<void> {
const dryRun = process.argv.includes('--dry-run')
const connectionString = process.env.DATABASE_URL ?? process.env.POSTGRES_URL
if (!connectionString) {
console.error('Missing DATABASE_URL or POSTGRES_URL')
process.exit(1)
}

const client = postgres(connectionString, {
prepare: false,
idle_timeout: 20,
connect_timeout: 30,
max: 5,
onnotice: () => {},
})
const db = drizzle(client)

const stats = { tables: 0, tablesKeyed: 0, rowsKeyed: 0, failed: 0 }

try {
// Tables with a bytewise (`COLLATE "C"`) inversion or duplicate among their
// non-null keys. Walking rows in their authoritative `position, id` order (the
// order the re-key writes), a healthy table has strictly INCREASING keys; flag
// any table where a row's key is `>=` the next row's. Ordering by `position`
// (not by `order_key`) is what makes this detect actual mis-assignment — e.g.
// pos 0 holding "a0" while pos 1 holds "Zz" (bytewise "Zz" < "a0") — and not
// just adjacent duplicates. The explicit `COLLATE "C"` keeps the comparison
// bytewise whether or not migration 0228 has been applied yet.
const pending = await db.execute<{ table_id: string }>(sql`
SELECT DISTINCT table_id FROM (
SELECT
table_id,
order_key,
LEAD(order_key) OVER (
PARTITION BY table_id ORDER BY position, id
) AS next_key
FROM user_table_rows
WHERE order_key IS NOT NULL
) t
WHERE next_key IS NOT NULL AND order_key COLLATE "C" >= next_key COLLATE "C"
Comment thread
cursor[bot] marked this conversation as resolved.
`)

console.log(
`Repair starting — ${pending.length} table(s) with mis-ordered keys${dryRun ? ' [DRY RUN]' : ''}`
)

for (const { table_id: tableId } of pending) {
stats.tables += 1
try {
if (dryRun) {
// Sizing only — count outside any transaction/lock so we never serialize
// live inserts on the table (taking the advisory lock just to count would
// make the dry run the opposite of safe).
const [row] = await db
.select({ rowCount: sql<number>`count(*)`.mapWith(Number) })
.from(userTableRows)
.where(eq(userTableRows.tableId, tableId))
stats.tablesKeyed += 1
stats.rowsKeyed += row.rowCount
console.log(` ${tableId}: would re-key ${row.rowCount} rows`)
continue
}

const keyed = await db.transaction(async (trx) => {
// Serialize with concurrent inserts on this table (same lock the app uses).
await trx.execute(
sql`SELECT pg_advisory_xact_lock(hashtextextended(${`user_table_rows_pos:${tableId}`}, 0))`
)
const rows = await trx
.select({ id: userTableRows.id })
.from(userTableRows)
.where(eq(userTableRows.tableId, tableId))
.orderBy(asc(userTableRows.position), asc(userTableRows.id))

if (rows.length === 0) return 0
const keys = nKeysBetween(null, null, rows.length)

// Chunked UPDATE … FROM (VALUES …) mapping id → key (see WRITE_CHUNK_SIZE).
for (let start = 0; start < rows.length; start += WRITE_CHUNK_SIZE) {
const chunk = rows.slice(start, start + WRITE_CHUNK_SIZE)
const values = sql.join(
chunk.map((r, i) => sql`(${r.id}, ${keys[start + i]})`),
sql`, `
)
await trx.execute(sql`
UPDATE user_table_rows AS t
SET order_key = v.order_key
FROM (VALUES ${values}) AS v(id, order_key)
WHERE t.id = v.id AND t.table_id = ${tableId}
Comment thread
TheodoreSpeaks marked this conversation as resolved.
`)
}
return rows.length
})
stats.tablesKeyed += 1
stats.rowsKeyed += keyed
console.log(` ${tableId}: re-keyed ${keyed} rows`)
Comment thread
TheodoreSpeaks marked this conversation as resolved.
} catch (error) {
stats.failed += 1
console.error(` ${tableId}: FAILED — ${getErrorMessage(error)}`)
}
Comment thread
TheodoreSpeaks marked this conversation as resolved.
}

const verb = dryRun ? 'to re-key' : 're-keyed'
console.log(`Repair complete.${dryRun ? ' [DRY RUN — no rows written]' : ''}`)
console.log(` tables scanned: ${stats.tables}`)
console.log(` tables ${verb}: ${stats.tablesKeyed}`)
console.log(` rows ${verb}: ${stats.rowsKeyed}`)
console.log(` failed: ${stats.failed}`)
if (stats.failed > 0) process.exitCode = 1
} finally {
await client.end({ timeout: 5 }).catch(() => {})
}
}

if ((import.meta as { main?: boolean }).main) {
try {
await runRepair()
} catch (error) {
console.error('Repair aborted:', getErrorMessage(error))
process.exitCode = 1
}
}
18 changes: 18 additions & 0 deletions packages/db/migrations/0228_order_key_binary_collation.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
-- Compare `user_table_rows.order_key` bytewise (COLLATE "C") instead of under the
-- database's default locale (en_US.UTF-8).
--
-- order_key holds fractional-index strings from the base-62 alphabet
-- 0-9 A-Z a-z, generated by apps/sim/lib/fractional-indexing. That algorithm
-- compares keys BYTEWISE (JS string `>=`) and its integer-length scheme relies on
-- the ASCII boundary `Z` (0x5A) < `a` (0x61). Under en_US.UTF-8 lowercase
-- interleaves with/precedes uppercase ("a0" < "Zz"), the opposite of bytewise — so
-- `max(order_key)`, neighbor lookups, and `ORDER BY order_key` disagreed with the
-- library, making inserts mint keys that fail `generateKeyBetween`'s `a >= b`
-- assertion and making rows display out of order.
--
-- Setting the column to COLLATE "C" makes every comparison, aggregate, and the
-- dependent index `user_table_rows_table_order_key_idx` use byte order, matching
-- the library. Postgres rebuilds that index under the new collation as part of the
-- ALTER. Takes a brief ACCESS EXCLUSIVE lock; user_table rows are low-volume per
-- the fractional-ordering rollout.
ALTER TABLE "user_table_rows" ALTER COLUMN "order_key" SET DATA TYPE text COLLATE "C";
Loading
Loading