I told it to write a JavaScript function that I could run from the Edge console that would build an array, send it to the GPU, sort it, and return the new order.
This is the code it produced:
/**
* Single-submit Bitonic Sort in WGSL, storing each record in 102 u32s.
*
* - We do multiple compute passes (one per (k, j) pair), but they are all recorded
* in a single command buffer, so the sort itself needs only one 'device.queue.submit()'.
* - Each pass gets its own small uniform buffer with (Npow2, k, j).
*
* Usage:
* 1) Enable WebGPU if needed (chrome://flags/#enable-unsafe-webgpu, or edge://flags/#enable-unsafe-webgpu in Edge).
* 2) Open DevTools, paste code, then:
* await demoBitonicSortSingleSubmit(1024);
*/
async function demoBitonicSortSingleSubmit(numRecords = 16) {
// --------------------------------------------------------------------------
// 1) Check for WebGPU
// --------------------------------------------------------------------------
if (!("gpu" in navigator)) {
console.error("WebGPU not available in this browser.");
return;
}
const adapter = await navigator.gpu.requestAdapter();
if (!adapter) {
console.error("Failed to get GPU adapter.");
return;
}
const device = await adapter.requestDevice();
// --------------------------------------------------------------------------
// 2) Generate random records, each occupying 102 u32s.
// --------------------------------------------------------------------------
const MAX_LENGTH_PER_RECORD = 100;
const RECORD_SLOT_SIZE = 1 + 1 + MAX_LENGTH_PER_RECORD; // 102 total
function createRandomRecord(recordID) {
// random length [1..100]
const length = Math.floor(Math.random() * MAX_LENGTH_PER_RECORD) + 1;
const arr = new Uint32Array(RECORD_SLOT_SIZE);
arr[0] = recordID; // ID
arr[1] = length; // length
for (let i = 0; i < length; i++) {
arr[2 + i] = Math.floor(Math.random() * 100);
}
return arr;
}
const records = [];
for (let i = 0; i < numRecords; i++) {
records.push(createRandomRecord(i + 1));
}
// --------------------------------------------------------------------------
// 3) Pad N up to next power of two
// --------------------------------------------------------------------------
function nextPowerOfTwo(x) {
return 1 << Math.ceil(Math.log2(x));
}
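// e.g. nextPowerOfTwo(16) === 16 and nextPowerOfTwo(17) === 32, so 16 records need no padding.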
const N = numRecords;
const Npow2 = nextPowerOfTwo(N);
const totalCount = Npow2 * RECORD_SLOT_SIZE;
const combined = new Uint32Array(totalCount);
// Copy real records
for (let i = 0; i < N; i++) {
combined.set(records[i], i * RECORD_SLOT_SIZE);
}
// Fill the rest with sentinel records (all 0xffffffff) so the padding sorts after every real record
for (let i = N; i < Npow2; i++) {
const base = i * RECORD_SLOT_SIZE;
combined[base + 0] = 0xffffffff; // large ID
combined[base + 1] = MAX_LENGTH_PER_RECORD;
for (let j = 0; j < MAX_LENGTH_PER_RECORD; j++) {
combined[base + 2 + j] = 0xffffffff;
}
}
// --------------------------------------------------------------------------
// 4) Log "BEFORE" data (only the first N real records)
// --------------------------------------------------------------------------
console.log("----- BEFORE SORT -----");
for (let i = 0; i < N; i++) {
const base = i * RECORD_SLOT_SIZE;
const id = combined[base];
const length = combined[base + 1];
const arrPreview = Array.from(
combined.slice(base + 2, base + 2 + Math.min(length, 5))
);
console.log(
`Record ${i}: ID=${id}, length=${length}, data=[${arrPreview.join(", ")} ...]`
);
}
// --------------------------------------------------------------------------
// 5) Create GPU Buffers + Pipeline
// --------------------------------------------------------------------------
const bufferSizeBytes = combined.byteLength;
const dataBuffer = device.createBuffer({
size: bufferSizeBytes,
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST,
});
device.queue.writeBuffer(dataBuffer, 0, combined);
const resultBuffer = device.createBuffer({
size: bufferSizeBytes,
usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
});
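// A buffer with MAP_READ usage may only be combined with COPY_DST, so the sorted
// data has to be copied out of the STORAGE buffer into this staging buffer
// before it can be mapped on the CPU.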
// We don't use a single uniform buffer. Instead, each pass gets its own small buffer
// holding its (Npow2, k, j). They are all created up front, before encoding the passes;
// the total pass count is log2(Npow2) * (log2(Npow2) + 1) / 2.
// --------------------------------------------------------------------------
// 6) The WGSL code
// --------------------------------------------------------------------------
const wgslCode = /* wgsl */`
struct Params {
n: u32,
k: u32,
j: u32,
}
@group(0) @binding(0) var<storage, read_write> data: array<u32>;
@group(0) @binding(1) var<uniform> params: Params;
const RECORD_SLOT_SIZE = 102u;
// Compare entire array (starting at data[2..]) lexicographically
fn compareArrays(iA: u32, iB: u32) -> i32 {
let baseA = iA * RECORD_SLOT_SIZE;
let baseB = iB * RECORD_SLOT_SIZE;
let lenA = data[baseA + 1u];
let lenB = data[baseB + 1u];
let minLen = min(lenA, lenB);
for (var idx = 0u; idx < minLen; idx = idx + 1u) {
let aVal = data[baseA + 2u + idx];
let bVal = data[baseB + 2u + idx];
if (aVal < bVal) {
return -1;
} else if (aVal > bVal) {
return 1;
}
}
// If all in common are the same, compare length
if (lenA < lenB) {
return -1;
} else if (lenA > lenB) {
return 1;
}
return 0;
}
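// Example: [3, 5, ...] sorts before [3, 7, ...]; when one array is a prefix of
// the other (e.g. [3, 5] vs [3, 5, 1]), the shorter one sorts first.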
fn swapRecords(iA: u32, iB: u32) {
let baseA = iA * RECORD_SLOT_SIZE;
let baseB = iB * RECORD_SLOT_SIZE;
for (var i = 0u; i < RECORD_SLOT_SIZE; i = i + 1u) {
let tmp = data[baseA + i];
data[baseA + i] = data[baseB + i];
data[baseB + i] = tmp;
}
}
@compute @workgroup_size(64)
fn main(@builtin(global_invocation_id) globalId: vec3<u32>) {
let i = globalId.x;
let n = params.n;
if (i >= n) {
return;
}
let kVal = params.k;
let jVal = params.j;
let partner = i ^ jVal;
// i < partner => no double-swap
if (partner < n && i < partner) {
let ascending = ((i & kVal) == 0u);
let cmp = compareArrays(i, partner);
if ((ascending && cmp > 0) || (!ascending && cmp < 0)) {
swapRecords(i, partner);
}
}
}
`;
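// In the shader, (i & k) selects the direction of each compare-exchange. For
// example, with k = 4 and j = 2, thread i = 1 gets partner 1 ^ 2 = 3; since
// (1 & 4) == 0 the pair sorts ascending, so records 1 and 3 are swapped only
// when record 1 compares greater than record 3.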
const module = device.createShaderModule({ code: wgslCode });
const pipeline = device.createComputePipeline({
layout: device.createPipelineLayout({
bindGroupLayouts: [
device.createBindGroupLayout({
entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "uniform" } },
],
}),
],
}),
compute: {
module,
entryPoint: "main",
},
});
// We keep a single "data" buffer binding but create one "Params" uniform buffer
// (and one bind group) per pass. In the command encoder we then call
// pass.setBindGroup(0, passBindGroups[p]), so each pass sees its own (Npow2, k, j).
// First, precompute all (k, j) pairs.
const passPairs = [];
for (let k = 2; k <= Npow2; k <<= 1) {
for (let j = (k >> 1); j > 0; j >>= 1) {
passPairs.push({ k, j });
}
}
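// For Npow2 = 8 this yields (k=2,j=1), (k=4,j=2), (k=4,j=1), (k=8,j=4),
// (k=8,j=2), (k=8,j=1): with m = log2(8) = 3 stages, m * (m + 1) / 2 = 6 passes.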
// We'll create a small uniform buffer for each pass
const passUniformBuffers = passPairs.map(({ k, j }) => {
const uBuf = device.createBuffer({
size: 3 * 4, // 3 u32
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
});
const arr = new Uint32Array([Npow2, k, j]);
// We'll write the data once, outside the main command buffer:
device.queue.writeBuffer(uBuf, 0, arr);
return uBuf;
});
// We'll create a list of bind groups
const passBindGroups = passUniformBuffers.map(uBuf => {
return device.createBindGroup({
layout: pipeline.getBindGroupLayout(0),
entries: [
{ binding: 0, resource: { buffer: dataBuffer } },
{ binding: 1, resource: { buffer: uBuf } },
],
});
});
// --------------------------------------------------------------------------
// 7) Encode all passes in a single command buffer, one pass per (k,j).
// --------------------------------------------------------------------------
console.log("----- SORTING STARTED -----");
const sortStartTime = performance.now();
const commandEncoder = device.createCommandEncoder();
for (let p = 0; p < passPairs.length; p++) {
const pass = commandEncoder.beginComputePass();
pass.setPipeline(pipeline);
pass.setBindGroup(0, passBindGroups[p]);
const workgroupSize = 64;
const numGroups = Math.ceil(Npow2 / workgroupSize);
pass.dispatchWorkgroups(numGroups);
pass.end();
}
device.queue.submit([commandEncoder.finish()]);
await device.queue.onSubmittedWorkDone();
const sortEndTime = performance.now();
console.log("----- SORTING COMPLETED -----");
console.log(`Bitonic Sort took ${(sortEndTime - sortStartTime).toFixed(3)} ms.`);
// --------------------------------------------------------------------------
// 8) Copy final sorted data to CPU-readable buffer
// --------------------------------------------------------------------------
{
const cmdEnc = device.createCommandEncoder();
cmdEnc.copyBufferToBuffer(dataBuffer, 0, resultBuffer, 0, bufferSizeBytes);
device.queue.submit([cmdEnc.finish()]);
await device.queue.onSubmittedWorkDone();
}
// --------------------------------------------------------------------------
// 9) Read back & parse
// --------------------------------------------------------------------------
await resultBuffer.mapAsync(GPUMapMode.READ);
const gpuData = new Uint32Array(resultBuffer.getMappedRange());
const finalData = gpuData.slice();
resultBuffer.unmap();
function extractRecord(i) {
const base = i * RECORD_SLOT_SIZE;
const id = finalData[base + 0];
const length = finalData[base + 1];
const arr = finalData.slice(base + 2, base + 2 + length);
return { recordID: id, array: Array.from(arr) };
}
const sortedRecords = [];
for (let i = 0; i < N; i++) {
sortedRecords.push(extractRecord(i));
}
// --------------------------------------------------------------------------
// 10) Log "AFTER" data
// --------------------------------------------------------------------------
console.log("----- AFTER SORT -----");
for (let i = 0; i < N; i++) {
const rec = sortedRecords[i];
const arrPreview = rec.array.slice(0, 5).join(", ");
console.log(
`Sorted ${i}: ID=${rec.recordID}, length=${rec.array.length}, data=[${arrPreview} ...]`
);
}
return sortedRecords;
}
// Example usage:
await demoBitonicSortSingleSubmit(16);
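To double-check the output, the returned records can be verified on the CPU with the same rule the shader uses: compare element by element, then fall back to length. A minimal sketch, assuming the { recordID, array } shape returned above:
// Hypothetical helper, not part of the generated code: checks that the returned
// records are in non-decreasing lexicographic order.
function isLexicographicallySorted(records) {
  const compare = (a, b) => {
    const minLen = Math.min(a.length, b.length);
    for (let i = 0; i < minLen; i++) {
      if (a[i] !== b[i]) return a[i] < b[i] ? -1 : 1;
    }
    return a.length - b.length;
  };
  for (let i = 1; i < records.length; i++) {
    if (compare(records[i - 1].array, records[i].array) > 0) return false;
  }
  return true;
}
const sorted = await demoBitonicSortSingleSubmit(1024);
console.log("CPU check, non-decreasing order:", isLexicographicallySorted(sorted));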