-- Clean start
DROP TABLE IF EXISTS orders;
CREATE TABLE orders (
    order_id    BIGSERIAL PRIMARY KEY,
    order_date  DATE NOT NULL,
    amount      INT  NOT NULL
);

-- Create index we'll cluster by
CREATE INDEX idx_orders_order_date ON orders(order_date);

-- Insert 200k rows with RANDOM dates (scattered physical order)
INSERT INTO orders(order_date, amount)
SELECT
  DATE '2025-01-01' + (trunc(random() * 365))::int,
  (random() * 1000)::int
FROM generate_series(1, 200000);

ANALYZE orders;

-- 1) Show physical order BEFORE CLUSTER (ctid is physical location)
-- You'll see order_date jumping around even when ordered by ctid
SELECT ctid, order_date
FROM orders
ORDER BY ctid
LIMIT 20;

-- 2) Baseline plan BEFORE CLUSTER
-- Pick a narrow range that returns some rows
EXPLAIN (ANALYZE, BUFFERS)
SELECT *
FROM orders
WHERE order_date BETWEEN DATE '2025-06-01' AND DATE '2025-06-07';

-- Optional: check correlation BEFORE (usually low when inserted randomly)
SELECT
  attname,
  correlation
FROM pg_stats
WHERE schemaname = 'public'
  AND tablename  = 'orders'
  AND attname    = 'order_date';

-- 3) CLUSTER physically rewrites the table into index order
CLUSTER orders USING idx_orders_order_date;
ANALYZE orders;

-- Show physical order AFTER CLUSTER
-- Now order_date should look "monotonic-ish" when ordered by ctid
SELECT ctid, order_date
FROM orders
ORDER BY ctid
LIMIT 20;

-- Plan AFTER CLUSTER (often better: fewer heap page jumps)
EXPLAIN (ANALYZE, BUFFERS)
SELECT *
FROM orders
WHERE order_date BETWEEN DATE '2025-06-01' AND DATE '2025-06-07';

-- Correlation AFTER (usually much higher, closer to 1.0)
SELECT
  attname,
  correlation
FROM pg_stats
WHERE schemaname = 'public'
  AND tablename  = 'orders'
  AND attname    = 'order_date';

-- 4) Insert NEW DATA in a way that breaks the physical clustering
-- We insert "old" dates, but they will be appended to the end of the heap.
INSERT INTO orders(order_date, amount)
SELECT
  DATE '2025-01-01' + (trunc(random() * 30))::int,  -- January dates
  (random() * 1000)::int
FROM generate_series(1, 50000);

ANALYZE orders;

-- Show that "January" dates now appear at the end physically (ctid high)
SELECT ctid, order_date
FROM orders
ORDER BY ctid DESC
LIMIT 20;

-- 5) Plan AFTER new out-of-order data
-- Same query as before: planner may switch to Bitmap Heap Scan or even Seq Scan
-- and BUFFERS/read patterns often worsen.
EXPLAIN (ANALYZE, BUFFERS)
SELECT *
FROM orders
WHERE order_date BETWEEN DATE '2025-06-01' AND DATE '2025-06-07';

-- Correlation usually drops again after enough "out-of-order" inserts
SELECT
  attname,
  correlation
FROM pg_stats
WHERE schemaname = 'public'
  AND tablename  = 'orders'
  AND attname    = 'order_date';

--Refresh using a specific index (explicit)
CLUSTER orders USING idx_orders_order_date;
ANALYZE orders;