Tags: PyTorch · 3d-reconstruction · wireframe · building · point-cloud · s23dr · cvpr-2026
jacklangerman committed
Commit 0f31e57 · 1 Parent(s): 465f2c6

4096-release (#1)


- Release: S23DR 2026 learned baseline (HSS=0.382) (43975eb3f3e3ba3f9773ef8afda9a3d0e85b2f9c)

Files changed (37)
  1. .gitignore +5 -0
  2. REPRODUCE.md +194 -0
  3. checkpoint.pt +2 -2
  4. configs/base.json +39 -0
  5. repro_runs/compiled_repro_hss376/20260408_173614_64c7_4670_args.json +66 -0
  6. repro_runs/compiled_repro_hss376/20260408_173614_64c7_4670_final.pt +3 -0
  7. repro_runs/compiled_repro_hss376/20260408_194447_3061_6284_args.json +66 -0
  8. repro_runs/compiled_repro_hss376/20260408_194447_3061_6284_final.pt +3 -0
  9. repro_runs/compiled_repro_hss376/20260408_201237_4177_7208_args.json +66 -0
  10. repro_runs/compiled_repro_hss376/20260408_201237_4177_7208_final.pt +3 -0
  11. repro_runs/deterministic_hss372/20260330_025738_f0c9_3400_args.json +66 -0
  12. repro_runs/deterministic_hss372/20260330_025738_f0c9_3400_final.pt +3 -0
  13. repro_runs/deterministic_hss372/20260330_071030_8c95_3610_args.json +66 -0
  14. repro_runs/deterministic_hss372/20260330_071030_8c95_3610_final.pt +3 -0
  15. repro_runs/deterministic_hss372/20260330_073711_fdd2_8901_args.json +66 -0
  16. repro_runs/deterministic_hss372/20260330_073711_fdd2_8901_final.pt +3 -0
  17. repro_runs/e2e_repro4_hss379/20260329_213417_ef91_6503_args.json +66 -0
  18. repro_runs/e2e_repro4_hss379/20260329_213417_ef91_6503_final.pt +3 -0
  19. repro_runs/e2e_repro4_hss379/20260330_002648_ca92_4553_args.json +66 -0
  20. repro_runs/e2e_repro4_hss379/20260330_002648_ca92_4553_final.pt +3 -0
  21. repro_runs/e2e_repro4_hss379/20260330_005554_dec7_7390_args.json +66 -0
  22. repro_runs/e2e_repro4_hss379/20260330_005554_dec7_7390_final.pt +3 -0
  23. reproduce.sh +68 -0
  24. reproduce_deterministic.sh +71 -0
  25. s23dr_2026_example/attention.py +0 -85
  26. s23dr_2026_example/cache_scenes.py +0 -195
  27. s23dr_2026_example/color_mappings.py +0 -26
  28. s23dr_2026_example/data.py +3 -13
  29. s23dr_2026_example/losses.py +10 -106
  30. s23dr_2026_example/make_sampled_cache.py +0 -185
  31. s23dr_2026_example/model.py +4 -181
  32. s23dr_2026_example/sinkhorn.py +0 -55
  33. s23dr_2026_example/soft_hss_loss.py +0 -507
  34. s23dr_2026_example/train.py +530 -0
  35. s23dr_2026_example/varifold.py +9 -152
  36. s23dr_2026_example/wire_varifold_kernels.py +2 -295
  37. script.py +5 -5
.gitignore ADDED
@@ -0,0 +1,5 @@
__pycache__/
*.pyc
runs/
*.png
*.log
REPRODUCE.md ADDED
@@ -0,0 +1,194 @@
# Reproducing the Best Checkpoint (HSS=0.382)

## Quick Start

The `checkpoint.pt` in this repo is the final model. To run inference:

```bash
python script.py
```

To reproduce from scratch (~3hr on 1x RTX 4090):

```bash
bash reproduce.sh
```

## Exact Recipe

Architecture (unchanged across all 3 steps):
```
Perceiver: hidden=256, ff=1024, latent_tokens=256, latent_layers=7
encoder_layers=4, decoder_layers=3, cross_attn_interval=4
num_heads=4, kv_heads_cross=2, kv_heads_self=2
qk_norm=True (L2), rms_norm=True, dropout=0.1
segments=64, segment_param=midpoint_dir_len, segment_conf=True
behind_emb_dim=8, vote_features=True, activation=gelu
```
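With `segment_param=midpoint_dir_len`, each of the 64 predicted segments is parametrized as a midpoint, a direction, and a length rather than as raw endpoints. A minimal numpy sketch of the conversion back to endpoint pairs (the actual head layout in `model.py` may differ; the function name here is illustrative):

```python
import numpy as np

def segments_to_endpoints(midpoint, direction, length):
    """Convert (midpoint, direction, length) segments to endpoint pairs.

    midpoint:  (N, 3) segment centers
    direction: (N, 3) direction vectors (normalized here)
    length:    (N,)   segment lengths
    """
    d = direction / np.linalg.norm(direction, axis=-1, keepdims=True)
    half = 0.5 * length[:, None] * d
    return midpoint - half, midpoint + half  # (p0, p1)

# A unit-length segment along x, centered at the origin:
p0, p1 = segments_to_endpoints(
    np.zeros((1, 3)), np.array([[2.0, 0.0, 0.0]]), np.array([1.0]))
```

This parametrization decouples position, orientation, and extent, which is why a separate endpoint loss (Step 3 below) is needed to directly supervise vertex positions.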

All shared config lives in `configs/base.json`.
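Each run's `*_args.json` under `repro_runs/` records an `args_from` key pointing at `configs/base.json`, which suggests the effective arguments are the base config overlaid with step-specific overrides. A hypothetical sketch of that merge (the real CLI parsing lives in `train.py`; `load_args` is an illustrative name, not the actual function):

```python
import json

def load_args(base_path, overrides):
    """Merge step-specific overrides onto the shared base config."""
    with open(base_path) as f:
        args = json.load(f)
    args.update(overrides)           # step-specific keys win
    args["args_from"] = base_path    # record provenance, as the run logs do
    return args
```

For example, Step 2 would override roughly `{"lr": 3e-05, "batch_size": 64, "seq_len": 4096, "steps": 135000, "resume": "..."}` on top of the base.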

### Step 1: 2048 Phase 1 (from scratch) — ~1.5hr

```
Data: hf://usm3d/s23dr-2026-sampled_2048_v2:train (16,508 samples)
Steps: 0 -> 125,000 (242 epochs)
LR: 3e-4, warmup=10,000
Batch size: 32
Optimizer: AdamW, betas=(0.9, 0.95), weight_decay=0.01
Sinkhorn: eps=0.1, iters=20, dustbin=0.3
Conf: weight=0.1, mode=sinkhorn, head_wd=0.1
Endpoint: OFF
Aug: rotate=True, flip=True
Seed: 353
```

Trains the perceiver from random init on 2048-point samples. The Sinkhorn
optimal transport loss learns to match predicted segments to ground truth.
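The matching idea can be sketched generically: run entropic Sinkhorn on a predicted-vs-GT cost matrix augmented with a constant-cost "dustbin" row/column that absorbs unmatched segments. This is a numpy sketch under the stated hyperparameters (eps=0.1, iters=20, dustbin=0.3); the actual cost construction and marginals in `sinkhorn.py`/`losses.py` are assumptions here:

```python
import numpy as np

def _logsumexp(x, axis):
    m = x.max(axis=axis, keepdims=True)
    return np.squeeze(m, axis=axis) + np.log(np.exp(x - m).sum(axis=axis))

def sinkhorn_with_dustbin(cost, eps=0.1, iters=20, dustbin=0.3):
    """Soft assignment between P predictions and G GT segments.

    cost: (P, G) pairwise segment distances.
    Returns a (P+1, G+1) transport plan; the extra row/col is the dustbin
    that collects predictions / GT segments with no good match.
    """
    P, G = cost.shape
    C = np.full((P + 1, G + 1), dustbin)   # dustbin entries cost `dustbin`
    C[:P, :G] = cost
    logK = -C / eps                        # entropic kernel, log domain
    log_mu = np.full(P + 1, -np.log(P + 1))  # uniform marginals (assumed)
    log_nu = np.full(G + 1, -np.log(G + 1))
    u = np.zeros(P + 1)
    v = np.zeros(G + 1)
    for _ in range(iters):                 # log-domain Sinkhorn iterations
        u = log_mu - _logsumexp(logK + v[None, :], axis=1)
        v = log_nu - _logsumexp(logK + u[:, None], axis=0)
    return np.exp(logK + u[:, None] + v[None, :])
```

Lower-cost pairs receive more transport mass, giving a differentiable soft matching that the loss can be computed over.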

**Why 2048 first:** Training directly on 4096 overfits (1.47x train/val ratio
vs 1.19x for 2048). The 2048 model learns better-generalized representations.

**Output:** HSS ~0.28.

### Step 2: 4096 finetune (constant LR) — ~15min

```
Resume: Step 1 -> step125000.pt
Data: hf://usm3d/s23dr-2026-sampled_4096_v2:train (15,892 samples)
Steps: 125,001 -> 135,000 (10k steps)
LR: 3e-5 (constant, no cooldown)
Batch size: 64
Endpoint: OFF
```

Switches input from 2048 to 4096 points, increasing structural coverage from
66% to 74%. The gentle LR (3e-5) preserves learned representations while
adapting to the extra input. A higher LR (>1e-4) causes catastrophic forgetting.

HSS jumps from 0.28 to 0.35 in ~5k steps and plateaus by 10k steps.

**Output:** HSS ~0.35.

### Step 3: Cooldown with endpoint loss — ~1hr

```
Resume: Step 2 -> step135000.pt
Data: hf://usm3d/s23dr-2026-sampled_4096_v2:train
Steps: 135,001 -> 170,000 (35k steps)
LR: 3e-5, cooldown_start=150,000, cooldown_steps=20,000
    (constant 3e-5 for 15k steps, then linear decay to ~0 over 20k)
Batch size: 64
Endpoint: weight=0.1
```
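Taken together, the schedule across the three steps is piecewise: linear warmup over the first 10k steps, constant LR after that, then a linear cooldown to ~0 starting at step 150k. A sketch of that schedule (the exact implementation in `train.py` may differ; note `base_lr` is 3e-4 in Step 1 and 3e-5 in Steps 2-3, and the cooldown is only enabled in Step 3):

```python
def lr_at(step, base_lr, warmup=10_000,
          cooldown_start=150_000, cooldown_steps=20_000):
    """Piecewise LR: linear warmup -> constant -> linear cooldown to 0."""
    if step < warmup:
        return base_lr * step / warmup
    if cooldown_steps and step >= cooldown_start:
        frac = min(1.0, (step - cooldown_start) / cooldown_steps)
        return base_lr * (1.0 - frac)
    return base_lr

lr_at(160_000, 3e-5)  # halfway through the cooldown -> 1.5e-5
```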

Adds a symmetric endpoint L1 loss (using the detached sinkhorn assignment) to
tighten vertex precision. The sinkhorn loss alone operates on the segment
midpoint/direction/length parametrization and doesn't directly penalize
endpoint position error.
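The "symmetric" part handles the fact that a segment's two endpoints have no canonical order: the loss scores both pairings and keeps the cheaper one. A numpy sketch of the core term (the real loss in `losses.py` additionally weights matched pairs by the detached sinkhorn assignment; that weighting is omitted here):

```python
import numpy as np

def symmetric_endpoint_l1(pred_p0, pred_p1, gt_p0, gt_p1):
    """L1 endpoint loss, invariant to endpoint ordering.

    Each argument: (N, 3) endpoints of N matched segment pairs.
    For every pair, evaluate both orderings (p0->g0, p1->g1) and
    (p0->g1, p1->g0), and keep the cheaper one.
    """
    direct = (np.abs(pred_p0 - gt_p0) + np.abs(pred_p1 - gt_p1)).sum(-1)
    flipped = (np.abs(pred_p0 - gt_p1) + np.abs(pred_p1 - gt_p0)).sum(-1)
    return np.minimum(direct, flipped).mean()
```

A prediction that merely swaps the two endpoints of a GT segment incurs zero loss, so the model is never penalized for an arbitrary endpoint ordering.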

**Output:** HSS=0.382, F1=0.414.

### Key Numbers

| Stage | Steps | HSS | F1 | What changed |
|-------|-------|-----|-----|-------------|
| After Step 1 | 125k | 0.281 | 0.156 | Learned geometry from 2048 pts |
| After Step 2 | 135k | 0.351 | 0.190 | 74% coverage from 4096 pts (up from 66%) |
| After Step 3 | 170k | **0.382** | **0.411** | Vertex precision from endpoint loss |

## Why This Works

1. **2048 training has low overfitting** (1.19x train/val ratio) — the model
   learns good representations without memorizing training samples.

2. **4096 data has a higher coverage ceiling** (74% vs 66% structural points) —
   more of the building surface is observed, improving vertex recall.

3. **Gentle finetuning preserves representations** — at lr=3e-5, the model
   keeps its learned geometry understanding while adapting to the extra input.

4. **Endpoint loss tightens vertices** — the symmetric endpoint distance
   directly penalizes vertex position errors, which the sinkhorn loss alone
   doesn't do (it operates on the midpoint/direction/length parametrization).

## What Doesn't Work

- **Training 4096 from scratch:** overfits (1.47x train/val gap), peaks at 0.346
- **BuildingWorld pretraining:** representations are orthogonal to S23DR (cosine sim = 0.05)
- **Mixed BW+S23DR training:** BW data hurts due to the domain gap
- **High dropout / weight decay:** prevents overfitting but causes underfitting
- **High finetune LR (>1e-4):** catastrophic forgetting of the 2048 representations
- **Steeper cooldown (1e-5, 20x drop):** slightly worse than 3e-5 for this checkpoint

## Reproduction Results

### End-to-end reproductions

| Model | HSS | F1 | IoU | Notes |
|-------|-----|-----|-----|-------|
| Original | 0.382 | 0.414 | 0.370 | Shipped checkpoint |
| E2E repro #4 | 0.379 | 0.409 | 0.369 | Closest E2E, `repro_runs/e2e_repro4_hss379/` |
| Compiled repro (from submission codebase) | 0.376 | — | — | Best compiled repro from this codebase, `repro_runs/compiled_repro_hss376/` |
| E2E repro #3 | 0.375 | 0.404 | 0.367 | |
| Deterministic E2E | 0.372 | 0.398 | 0.368 | Bit-reproducible, `repro_runs/deterministic_hss372/` |
| E2E repro #5 | 0.349 | 0.373 | — | Outlier (early compile divergence) |

### Partial reproductions (isolating pipeline stages)

| Test | Starting from | HSS | Gap to original |
|------|--------------|-----|-----------------|
| Step 3 from orig Step 2 (run A) | Original step135000.pt | 0.382 | 0.000 |
| Step 3 from orig Step 2 (run B) | Original step135000.pt | 0.384 | +0.002 |
| Step 2+3 from orig Step 1 | Original step125000.pt | 0.377 | -0.005 |
| Step 1 from orig step 100k | Original step100000.pt | 0.285 (Step 1 HSS) | +0.004 vs 0.281 |

Step 3 from the same checkpoint reproduces to within 0.002. The E2E variance
(0.349-0.379) is dominated by torch.compile nondeterminism in Step 1.

### All benchmarks

| Model | Input | HSS | F1 | IoU | Notes |
|-------|-------|-----|-----|-----|-------|
| Handcrafted baseline | raw views | 0.307 | 0.404 | 0.260 | |
| h256+qk+ep (submitted) | 2048 | 0.365 | 0.388 | 0.360 | HSS=0.427 on test |
| Original 3-step | 2048 | 0.373 | 0.404 | 0.363 | |
| Original 3-step | 4096 | 0.382 | 0.414 | 0.370 | Best ever |
| Step3 repro from orig S2 | 4096 | 0.384 | 0.414 | — | Near-exact repro |
| E2E repro #4 | 4096 | 0.379 | 0.409 | 0.369 | |
| Compiled repro (submission codebase) | 4096 | 0.376 | — | — | Best compiled from this exact codebase |
| E2E repro #3 | 4096 | 0.375 | 0.404 | 0.367 | |
| Deterministic E2E | 4096 | 0.372 | 0.398 | 0.368 | Bit-reproducible |

## Code Equivalence Verification

| Test | Result |
|------|--------|
| Forward pass (same checkpoint, same input) | Bit-identical (0.00 diff) |
| Loss computation | Bit-identical (0.00 diff) |
| Gradient computation | 5e-8 max diff |
| Training from same seed | Bit-identical steps 1-44 |
| Step 3 from same checkpoint (2 runs) | HSS=0.382, 0.384 |
| Deterministic mode (2 runs) | Bit-identical (0.00 diff) |

## Reproducibility Notes

**Default mode** (`reproduce.sh`): Uses torch.compile (~3x faster). Each run
gets different Triton kernels, causing ~1e-8 floating-point divergence at a
random step (31-45). This grows through chaotic SGD dynamics, giving HSS
variance of ~0.03 across runs. E2E reproductions land in the 0.349-0.379 range.

**Deterministic mode** (`--deterministic` flag): Disables torch.compile.
Bit-identical across runs with the same seed. HSS=0.372 (slightly lower than
compiled mode because eager-mode kernels follow a different numerical path).

**bad_samples.txt**: The shipped file has 156 entries to match the original
training. (Note: `wc -l` reports 155 because the last line lacks a trailing
newline.) Two additional bad samples (`47b0e0ce19b`, `4b2d56eb3ef`) were
discovered after the original training run. They are legitimately bad
(misaligned GT) but were included in the original training data. Adding them
changes the batch iteration order and costs ~0.005 HSS in deterministic mode
(0.372 -> 0.367) and ~0.04 in compiled mode due to compounded torch.compile
variance. Participants training from scratch may wish to add these 2 entries
for cleaner training data, but should expect slightly different scores due to
the changed iteration order.
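The `wc -l` discrepancy is just POSIX line counting: `wc -l` counts newline characters, so a final line without a trailing newline is not counted. A small sketch of counting entries robustly either way (`count_entries` is an illustrative helper, not part of this repo):

```python
def count_entries(path):
    """Count non-empty lines, whether or not the file ends in a newline."""
    with open(path) as f:
        return sum(1 for line in f if line.strip())
```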

The shipped `checkpoint.pt` is from the original training run (HSS=0.382).
checkpoint.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cc38a61ff512948b1dc92a30129d6efdd093f507948fc5b538050c4a38bfbf6c
- size 106460054
+ oid sha256:1296423a1a2e603ba55860d8ef8fa3a861764a7bbc3de96b776fca59cf5b11ab
+ size 106429791
configs/base.json ADDED
@@ -0,0 +1,39 @@
{
"arch": "perceiver",
"segments": 64,
"hidden": 256,
"ff": 1024,
"num_heads": 4,
"kv_heads_cross": 2,
"kv_heads_self": 2,
"latent_tokens": 256,
"latent_layers": 7,
"decoder_layers": 3,
"cross_attn_interval": 4,
"encoder_layers": 4,
"behind_emb_dim": 8,
"dropout": 0.1,
"activation": "gelu",
"rms_norm": true,
"qk_norm": true,
"qk_norm_type": "l2",
"segment_param": "midpoint_dir_len",
"segment_conf": true,
"vote_features": true,

"adam_betas": "0.9,0.95",
"weight_decay": 0.01,
"warmup": 10000,
"varifold_weight": 0.0,
"sinkhorn_weight": 1.0,
"sinkhorn_eps": 0.1,
"sinkhorn_iters": 20,
"sinkhorn_dustbin": 0.3,
"conf_weight": 0.1,
"conf_mode": "sinkhorn",
"conf_head_wd": 0.1,

"aug_rotate": true,
"aug_flip": true,
"seed": 353
}
repro_runs/compiled_repro_hss376/20260408_173614_64c7_4670_args.json ADDED
@@ -0,0 +1,66 @@
{
"activation": "gelu",
"adam_betas": "0.9,0.95",
"arch": "perceiver",
"args_from": "configs/base.json",
"aug_drop": 0.0,
"aug_flip": true,
"aug_jitter": 0.0,
"aug_rotate": true,
"batch_size": 32,
"behind_emb_dim": 8,
"cache_dir": "hf://usm3d/s23dr-2026-sampled_2048_v2:train",
"conf_clamp_min": null,
"conf_head_wd": 0.1,
"conf_mode": "sinkhorn",
"conf_weight": 0.1,
"cooldown_start": 0,
"cooldown_steps": 0,
"cosine_decay": false,
"cpu": false,
"cross_attn_interval": 4,
"decoder_input_xattn": false,
"decoder_layers": 3,
"deterministic": false,
"dropout": 0.1,
"ema_decay": 0.0,
"encoder_layers": 4,
"endpoint_warmup": 0,
"endpoint_weight": 0.0,
"ff": 1024,
"git_dirty": true,
"git_sha": "5b37dfc70c392936631b59d0bab24f20e4a2b0d9",
"hidden": 256,
"kv_heads_cross": 2,
"kv_heads_self": 2,
"latent_layers": 7,
"latent_tokens": 256,
"learnable_fourier": false,
"length_floor": 0.0,
"lr": 0.0003,
"num_heads": 4,
"out_dir": "runs/validate_155_compiled",
"pre_encoder_layers": 0,
"qk_norm": true,
"qk_norm_type": "l2",
"resume": "",
"rms_norm": true,
"seed": 353,
"segment_conf": true,
"segment_param": "midpoint_dir_len",
"segments": 64,
"seq_len": 2048,
"sinkhorn_dustbin": 0.3,
"sinkhorn_eps": 0.1,
"sinkhorn_eps_schedule": "none",
"sinkhorn_eps_start": null,
"sinkhorn_iters": 20,
"sinkhorn_weight": 1.0,
"steps": 125000,
"val_cache_dir": "",
"varifold_cross_only": false,
"varifold_weight": 0.0,
"vote_features": true,
"warmup": 10000,
"weight_decay": 0.01
}
repro_runs/compiled_repro_hss376/20260408_173614_64c7_4670_final.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e27c8ae20c291676a2c0b7e080d6d00be86f251ae6bdfe3cc3ff6f27f6646b00
size 106427231
repro_runs/compiled_repro_hss376/20260408_194447_3061_6284_args.json ADDED
@@ -0,0 +1,66 @@
{
"activation": "gelu",
"adam_betas": "0.9,0.95",
"arch": "perceiver",
"args_from": "configs/base.json",
"aug_drop": 0.0,
"aug_flip": true,
"aug_jitter": 0.0,
"aug_rotate": true,
"batch_size": 64,
"behind_emb_dim": 8,
"cache_dir": "hf://usm3d/s23dr-2026-sampled_4096_v2:train",
"conf_clamp_min": null,
"conf_head_wd": 0.1,
"conf_mode": "sinkhorn",
"conf_weight": 0.1,
"cooldown_start": 0,
"cooldown_steps": 0,
"cosine_decay": false,
"cpu": false,
"cross_attn_interval": 4,
"decoder_input_xattn": false,
"decoder_layers": 3,
"deterministic": false,
"dropout": 0.1,
"ema_decay": 0.0,
"encoder_layers": 4,
"endpoint_warmup": 0,
"endpoint_weight": 0.0,
"ff": 1024,
"git_dirty": true,
"git_sha": "5b37dfc70c392936631b59d0bab24f20e4a2b0d9",
"hidden": 256,
"kv_heads_cross": 2,
"kv_heads_self": 2,
"latent_layers": 7,
"latent_tokens": 256,
"learnable_fourier": false,
"length_floor": 0.0,
"lr": 3e-05,
"num_heads": 4,
"out_dir": "runs/validate_155_compiled",
"pre_encoder_layers": 0,
"qk_norm": true,
"qk_norm_type": "l2",
"resume": "runs/validate_155_compiled/20260408_173614_64c7_4670/checkpoints/step125000.pt",
"rms_norm": true,
"seed": 353,
"segment_conf": true,
"segment_param": "midpoint_dir_len",
"segments": 64,
"seq_len": 4096,
"sinkhorn_dustbin": 0.3,
"sinkhorn_eps": 0.1,
"sinkhorn_eps_schedule": "none",
"sinkhorn_eps_start": null,
"sinkhorn_iters": 20,
"sinkhorn_weight": 1.0,
"steps": 135000,
"val_cache_dir": "",
"varifold_cross_only": false,
"varifold_weight": 0.0,
"vote_features": true,
"warmup": 10000,
"weight_decay": 0.01
}
repro_runs/compiled_repro_hss376/20260408_194447_3061_6284_final.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:89bae7e879900c128bbfe1a05e2f6d4b8430675ed95d47ea2d975b198c71cdad
size 106429599
repro_runs/compiled_repro_hss376/20260408_201237_4177_7208_args.json ADDED
@@ -0,0 +1,66 @@
{
"activation": "gelu",
"adam_betas": "0.9,0.95",
"arch": "perceiver",
"args_from": "configs/base.json",
"aug_drop": 0.0,
"aug_flip": true,
"aug_jitter": 0.0,
"aug_rotate": true,
"batch_size": 64,
"behind_emb_dim": 8,
"cache_dir": "hf://usm3d/s23dr-2026-sampled_4096_v2:train",
"conf_clamp_min": null,
"conf_head_wd": 0.1,
"conf_mode": "sinkhorn",
"conf_weight": 0.1,
"cooldown_start": 150000,
"cooldown_steps": 20000,
"cosine_decay": false,
"cpu": false,
"cross_attn_interval": 4,
"decoder_input_xattn": false,
"decoder_layers": 3,
"deterministic": false,
"dropout": 0.1,
"ema_decay": 0.0,
"encoder_layers": 4,
"endpoint_warmup": 0,
"endpoint_weight": 0.1,
"ff": 1024,
"git_dirty": true,
"git_sha": "5b37dfc70c392936631b59d0bab24f20e4a2b0d9",
"hidden": 256,
"kv_heads_cross": 2,
"kv_heads_self": 2,
"latent_layers": 7,
"latent_tokens": 256,
"learnable_fourier": false,
"length_floor": 0.0,
"lr": 3e-05,
"num_heads": 4,
"out_dir": "runs/validate_155_compiled",
"pre_encoder_layers": 0,
"qk_norm": true,
"qk_norm_type": "l2",
"resume": "runs/validate_155_compiled/20260408_194447_3061_6284/checkpoints/step135000.pt",
"rms_norm": true,
"seed": 353,
"segment_conf": true,
"segment_param": "midpoint_dir_len",
"segments": 64,
"seq_len": 4096,
"sinkhorn_dustbin": 0.3,
"sinkhorn_eps": 0.1,
"sinkhorn_eps_schedule": "none",
"sinkhorn_eps_start": null,
"sinkhorn_iters": 20,
"sinkhorn_weight": 1.0,
"steps": 170000,
"val_cache_dir": "",
"varifold_cross_only": false,
"varifold_weight": 0.0,
"vote_features": true,
"warmup": 10000,
"weight_decay": 0.01
}
repro_runs/compiled_repro_hss376/20260408_201237_4177_7208_final.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cb5779baa160aab1f55b1c698cc39da6a72cae5c9bf5456a2b4063f2342b85a3
size 106429599
repro_runs/deterministic_hss372/20260330_025738_f0c9_3400_args.json ADDED
@@ -0,0 +1,66 @@
{
"activation": "gelu",
"adam_betas": "0.9,0.95",
"arch": "perceiver",
"args_from": "configs/base.json",
"aug_drop": 0.0,
"aug_flip": true,
"aug_jitter": 0.0,
"aug_rotate": true,
"batch_size": 32,
"behind_emb_dim": 8,
"cache_dir": "hf://usm3d/s23dr-2026-sampled_2048_v2:train",
"conf_clamp_min": null,
"conf_head_wd": 0.1,
"conf_mode": "sinkhorn",
"conf_weight": 0.1,
"cooldown_start": 0,
"cooldown_steps": 0,
"cosine_decay": false,
"cpu": false,
"cross_attn_interval": 4,
"decoder_input_xattn": false,
"decoder_layers": 3,
"deterministic": true,
"dropout": 0.1,
"ema_decay": 0.0,
"encoder_layers": 4,
"endpoint_warmup": 0,
"endpoint_weight": 0.0,
"ff": 1024,
"git_dirty": true,
"git_sha": "465f2c6eb6ce4be5c2e52e8384961930f5f9f20a",
"hidden": 256,
"kv_heads_cross": 2,
"kv_heads_self": 2,
"latent_layers": 7,
"latent_tokens": 256,
"learnable_fourier": false,
"length_floor": 0.0,
"lr": 0.0003,
"num_heads": 4,
"out_dir": "/workspace/s23dr_2026_example/repro_deterministic",
"pre_encoder_layers": 0,
"qk_norm": true,
"qk_norm_type": "l2",
"resume": "",
"rms_norm": true,
"seed": 353,
"segment_conf": true,
"segment_param": "midpoint_dir_len",
"segments": 64,
"seq_len": 2048,
"sinkhorn_dustbin": 0.3,
"sinkhorn_eps": 0.1,
"sinkhorn_eps_schedule": "none",
"sinkhorn_eps_start": null,
"sinkhorn_iters": 20,
"sinkhorn_weight": 1.0,
"steps": 125000,
"val_cache_dir": "",
"varifold_cross_only": false,
"varifold_weight": 0.0,
"vote_features": true,
"warmup": 10000,
"weight_decay": 0.01
}
repro_runs/deterministic_hss372/20260330_025738_f0c9_3400_final.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4bd05b2b323ced2ed94c4bd382f0e07ce5fae382f19ea35cb456ff631bcd1ac0
size 106423583
repro_runs/deterministic_hss372/20260330_071030_8c95_3610_args.json ADDED
@@ -0,0 +1,66 @@
{
"activation": "gelu",
"adam_betas": "0.9,0.95",
"arch": "perceiver",
"args_from": "configs/base.json",
"aug_drop": 0.0,
"aug_flip": true,
"aug_jitter": 0.0,
"aug_rotate": true,
"batch_size": 64,
"behind_emb_dim": 8,
"cache_dir": "hf://usm3d/s23dr-2026-sampled_4096_v2:train",
"conf_clamp_min": null,
"conf_head_wd": 0.1,
"conf_mode": "sinkhorn",
"conf_weight": 0.1,
"cooldown_start": 0,
"cooldown_steps": 0,
"cosine_decay": false,
"cpu": false,
"cross_attn_interval": 4,
"decoder_input_xattn": false,
"decoder_layers": 3,
"deterministic": true,
"dropout": 0.1,
"ema_decay": 0.0,
"encoder_layers": 4,
"endpoint_warmup": 0,
"endpoint_weight": 0.0,
"ff": 1024,
"git_dirty": true,
"git_sha": "465f2c6eb6ce4be5c2e52e8384961930f5f9f20a",
"hidden": 256,
"kv_heads_cross": 2,
"kv_heads_self": 2,
"latent_layers": 7,
"latent_tokens": 256,
"learnable_fourier": false,
"length_floor": 0.0,
"lr": 3e-05,
"num_heads": 4,
"out_dir": "/workspace/s23dr_2026_example/repro_deterministic",
"pre_encoder_layers": 0,
"qk_norm": true,
"qk_norm_type": "l2",
"resume": "/workspace/s23dr_2026_example/repro_deterministic/20260330_025738_f0c9_3400/checkpoints/step125000.pt",
"rms_norm": true,
"seed": 353,
"segment_conf": true,
"segment_param": "midpoint_dir_len",
"segments": 64,
"seq_len": 4096,
"sinkhorn_dustbin": 0.3,
"sinkhorn_eps": 0.1,
"sinkhorn_eps_schedule": "none",
"sinkhorn_eps_start": null,
"sinkhorn_iters": 20,
"sinkhorn_weight": 1.0,
"steps": 135000,
"val_cache_dir": "",
"varifold_cross_only": false,
"varifold_weight": 0.0,
"vote_features": true,
"warmup": 10000,
"weight_decay": 0.01
}
repro_runs/deterministic_hss372/20260330_071030_8c95_3610_final.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:992dbe9c7bfb4713b72d0f444f480f4a53e10844330d426d3fe0f367cdb96441
size 106425951
repro_runs/deterministic_hss372/20260330_073711_fdd2_8901_args.json ADDED
@@ -0,0 +1,66 @@
{
"activation": "gelu",
"adam_betas": "0.9,0.95",
"arch": "perceiver",
"args_from": "configs/base.json",
"aug_drop": 0.0,
"aug_flip": true,
"aug_jitter": 0.0,
"aug_rotate": true,
"batch_size": 64,
"behind_emb_dim": 8,
"cache_dir": "hf://usm3d/s23dr-2026-sampled_4096_v2:train",
"conf_clamp_min": null,
"conf_head_wd": 0.1,
"conf_mode": "sinkhorn",
"conf_weight": 0.1,
"cooldown_start": 150000,
"cooldown_steps": 20000,
"cosine_decay": false,
"cpu": false,
"cross_attn_interval": 4,
"decoder_input_xattn": false,
"decoder_layers": 3,
"deterministic": true,
"dropout": 0.1,
"ema_decay": 0.0,
"encoder_layers": 4,
"endpoint_warmup": 0,
"endpoint_weight": 0.1,
"ff": 1024,
"git_dirty": true,
"git_sha": "465f2c6eb6ce4be5c2e52e8384961930f5f9f20a",
"hidden": 256,
"kv_heads_cross": 2,
"kv_heads_self": 2,
"latent_layers": 7,
"latent_tokens": 256,
"learnable_fourier": false,
"length_floor": 0.0,
"lr": 3e-05,
"num_heads": 4,
"out_dir": "/workspace/s23dr_2026_example/repro_deterministic",
"pre_encoder_layers": 0,
"qk_norm": true,
"qk_norm_type": "l2",
"resume": "/workspace/s23dr_2026_example/repro_deterministic/20260330_071030_8c95_3610/checkpoints/step135000.pt",
"rms_norm": true,
"seed": 353,
"segment_conf": true,
"segment_param": "midpoint_dir_len",
"segments": 64,
"seq_len": 4096,
"sinkhorn_dustbin": 0.3,
"sinkhorn_eps": 0.1,
"sinkhorn_eps_schedule": "none",
"sinkhorn_eps_start": null,
"sinkhorn_iters": 20,
"sinkhorn_weight": 1.0,
"steps": 170000,
"val_cache_dir": "",
"varifold_cross_only": false,
"varifold_weight": 0.0,
"vote_features": true,
"warmup": 10000,
"weight_decay": 0.01
}
repro_runs/deterministic_hss372/20260330_073711_fdd2_8901_final.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bd7f508eb05e42ae70efb64fd8b3ab17d036000d49589b9122d4a1a2429c35db
size 106425951
repro_runs/e2e_repro4_hss379/20260329_213417_ef91_6503_args.json ADDED
@@ -0,0 +1,66 @@
{
"activation": "gelu",
"adam_betas": "0.9,0.95",
"arch": "perceiver",
"args_from": "configs/base.json",
"aug_drop": 0.0,
"aug_flip": true,
"aug_jitter": 0.0,
"aug_rotate": true,
"batch_size": 32,
"behind_emb_dim": 8,
"cache_dir": "hf://usm3d/s23dr-2026-sampled_2048_v2:train",
"conf_clamp_min": null,
"conf_head_wd": 0.1,
"conf_mode": "sinkhorn",
"conf_weight": 0.1,
"cooldown_start": 0,
"cooldown_steps": 0,
"cosine_decay": false,
"cpu": false,
"cross_attn_interval": 4,
"decoder_input_xattn": false,
"decoder_layers": 3,
"deterministic": false,
"dropout": 0.1,
"ema_decay": 0.0,
"encoder_layers": 4,
"endpoint_warmup": 0,
"endpoint_weight": 0.0,
"ff": 1024,
"git_dirty": true,
"git_sha": "465f2c6eb6ce4be5c2e52e8384961930f5f9f20a",
"hidden": 256,
"kv_heads_cross": 2,
"kv_heads_self": 2,
"latent_layers": 7,
"latent_tokens": 256,
"learnable_fourier": false,
"length_floor": 0.0,
"lr": 0.0003,
"num_heads": 4,
"out_dir": "/workspace/s23dr_2026_example/repro_e2e_run4",
"pre_encoder_layers": 0,
"qk_norm": true,
"qk_norm_type": "l2",
"resume": "",
"rms_norm": true,
"seed": 353,
"segment_conf": true,
"segment_param": "midpoint_dir_len",
"segments": 64,
"seq_len": 2048,
"sinkhorn_dustbin": 0.3,
"sinkhorn_eps": 0.1,
"sinkhorn_eps_schedule": "none",
"sinkhorn_eps_start": null,
"sinkhorn_iters": 20,
"sinkhorn_weight": 1.0,
"steps": 125000,
"val_cache_dir": "",
"varifold_cross_only": false,
"varifold_weight": 0.0,
"vote_features": true,
"warmup": 10000,
"weight_decay": 0.01
}
repro_runs/e2e_repro4_hss379/20260329_213417_ef91_6503_final.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f8ba72f11560b8164c87cc839e3070a37e024a2a618b7820d4819b739902aa2b
size 106427231
repro_runs/e2e_repro4_hss379/20260330_002648_ca92_4553_args.json ADDED
@@ -0,0 +1,66 @@
{
"activation": "gelu",
"adam_betas": "0.9,0.95",
"arch": "perceiver",
"args_from": "configs/base.json",
"aug_drop": 0.0,
"aug_flip": true,
"aug_jitter": 0.0,
"aug_rotate": true,
"batch_size": 64,
"behind_emb_dim": 8,
"cache_dir": "hf://usm3d/s23dr-2026-sampled_4096_v2:train",
"conf_clamp_min": null,
"conf_head_wd": 0.1,
"conf_mode": "sinkhorn",
"conf_weight": 0.1,
"cooldown_start": 0,
"cooldown_steps": 0,
"cosine_decay": false,
"cpu": false,
"cross_attn_interval": 4,
"decoder_input_xattn": false,
"decoder_layers": 3,
"deterministic": false,
"dropout": 0.1,
"ema_decay": 0.0,
"encoder_layers": 4,
"endpoint_warmup": 0,
"endpoint_weight": 0.0,
"ff": 1024,
"git_dirty": true,
"git_sha": "465f2c6eb6ce4be5c2e52e8384961930f5f9f20a",
"hidden": 256,
"kv_heads_cross": 2,
"kv_heads_self": 2,
"latent_layers": 7,
"latent_tokens": 256,
"learnable_fourier": false,
"length_floor": 0.0,
"lr": 3e-05,
"num_heads": 4,
"out_dir": "/workspace/s23dr_2026_example/repro_e2e_run4",
"pre_encoder_layers": 0,
"qk_norm": true,
"qk_norm_type": "l2",
"resume": "/workspace/s23dr_2026_example/repro_e2e_run4/20260329_213417_ef91_6503/checkpoints/step125000.pt",
"rms_norm": true,
"seed": 353,
"segment_conf": true,
"segment_param": "midpoint_dir_len",
"segments": 64,
"seq_len": 4096,
"sinkhorn_dustbin": 0.3,
"sinkhorn_eps": 0.1,
"sinkhorn_eps_schedule": "none",
"sinkhorn_eps_start": null,
"sinkhorn_iters": 20,
"sinkhorn_weight": 1.0,
"steps": 135000,
"val_cache_dir": "",
"varifold_cross_only": false,
"varifold_weight": 0.0,
"vote_features": true,
"warmup": 10000,
"weight_decay": 0.01
}
repro_runs/e2e_repro4_hss379/20260330_002648_ca92_4553_final.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8d2f2f135f8de8b676f35e5fe2693c56c2ed060649967ff877e63385d607992f
size 106429663
repro_runs/e2e_repro4_hss379/20260330_005554_dec7_7390_args.json ADDED
@@ -0,0 +1,66 @@
{
"activation": "gelu",
"adam_betas": "0.9,0.95",
"arch": "perceiver",
"args_from": "configs/base.json",
"aug_drop": 0.0,
"aug_flip": true,
"aug_jitter": 0.0,
"aug_rotate": true,
"batch_size": 64,
"behind_emb_dim": 8,
"cache_dir": "hf://usm3d/s23dr-2026-sampled_4096_v2:train",
"conf_clamp_min": null,
"conf_head_wd": 0.1,
"conf_mode": "sinkhorn",
"conf_weight": 0.1,
"cooldown_start": 150000,
"cooldown_steps": 20000,
"cosine_decay": false,
"cpu": false,
"cross_attn_interval": 4,
"decoder_input_xattn": false,
"decoder_layers": 3,
"deterministic": false,
"dropout": 0.1,
"ema_decay": 0.0,
"encoder_layers": 4,
"endpoint_warmup": 0,
"endpoint_weight": 0.1,
"ff": 1024,
"git_dirty": true,
"git_sha": "465f2c6eb6ce4be5c2e52e8384961930f5f9f20a",
"hidden": 256,
"kv_heads_cross": 2,
"kv_heads_self": 2,
"latent_layers": 7,
"latent_tokens": 256,
"learnable_fourier": false,
"length_floor": 0.0,
"lr": 3e-05,
"num_heads": 4,
"out_dir": "/workspace/s23dr_2026_example/repro_e2e_run4",
"pre_encoder_layers": 0,
"qk_norm": true,
"qk_norm_type": "l2",
"resume": "/workspace/s23dr_2026_example/repro_e2e_run4/20260330_002648_ca92_4553/checkpoints/step135000.pt",
"rms_norm": true,
"seed": 353,
"segment_conf": true,
"segment_param": "midpoint_dir_len",
"segments": 64,
"seq_len": 4096,
"sinkhorn_dustbin": 0.3,
"sinkhorn_eps": 0.1,
"sinkhorn_eps_schedule": "none",
"sinkhorn_eps_start": null,
"sinkhorn_iters": 20,
"sinkhorn_weight": 1.0,
"steps": 170000,
"val_cache_dir": "",
"varifold_cross_only": false,
"varifold_weight": 0.0,
"vote_features": true,
"warmup": 10000,
"weight_decay": 0.01
}
repro_runs/e2e_repro4_hss379/20260330_005554_dec7_7390_final.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:161888591f9066bd2c3f42400839a46755bec5e72dc5f1245d7b3336c1a7ddc2
3
+ size 106429663
reproduce.sh ADDED
@@ -0,0 +1,68 @@
1
+ #!/bin/bash
2
+ # Reproduce the best checkpoint (HSS=0.382) from scratch.
3
+ #
4
+ # Three stages:
5
+ # 1. Train on 2048-point data (~1.5hr on 1x RTX 4090)
6
+ # 2. Finetune on 4096-point data (~15min)
7
+ # 3. Cooldown with endpoint loss (~1hr)
8
+ #
9
+ # Total: ~3hr on a single GPU (plus ~30min for compilation + data loading).
10
+ # All shared config lives in configs/base.json.
11
+ # Each step only specifies what changes.
12
+ set -e
13
+
14
+ OUT_DIR="${1:-runs}"
15
+ BASE="--args-from configs/base.json"
16
+
17
+ # ============================================================
18
+ # Step 1: Train on 2048-point data (Phase 1)
19
+ # ============================================================
20
+ echo "=== Step 1: Training on 2048 data ==="
21
+ python -m s23dr_2026_example.train $BASE \
22
+ --cache-dir hf://usm3d/s23dr-2026-sampled_2048_v2:train \
23
+ --seq-len 2048 \
24
+ --lr 3e-4 \
25
+ --batch-size 32 \
26
+ --steps 125000 \
27
+ --out-dir "$OUT_DIR"
28
+
29
+ STEP1_DIR=$(ls -dt "$OUT_DIR"/*/args.json 2>/dev/null | head -1 | xargs dirname)
30
+ echo "Step 1 complete: $STEP1_DIR"
31
+
32
+ # ============================================================
33
+ # Step 2: Finetune on 4096-point data
34
+ # ============================================================
35
+ echo "=== Step 2: Finetuning on 4096 data ==="
36
+ python -m s23dr_2026_example.train $BASE \
37
+ --cache-dir hf://usm3d/s23dr-2026-sampled_4096_v2:train \
38
+ --resume "$STEP1_DIR/checkpoints/step125000.pt" \
39
+ --seq-len 4096 \
40
+ --lr 3e-5 \
41
+ --batch-size 64 \
42
+ --steps 135000 \
43
+ --out-dir "$OUT_DIR"
44
+
45
+ STEP2_DIR=$(ls -dt "$OUT_DIR"/*/args.json 2>/dev/null | head -1 | xargs dirname)
46
+ echo "Step 2 complete: $STEP2_DIR"
47
+
48
+ # ============================================================
49
+ # Step 3: Cooldown with endpoint loss
50
+ # ============================================================
51
+ echo "=== Step 3: Cooldown with endpoint loss ==="
52
+ python -m s23dr_2026_example.train $BASE \
53
+ --cache-dir hf://usm3d/s23dr-2026-sampled_4096_v2:train \
54
+ --resume "$STEP2_DIR/checkpoints/step135000.pt" \
55
+ --seq-len 4096 \
56
+ --lr 3e-5 \
57
+ --batch-size 64 \
58
+ --endpoint-weight 0.1 \
59
+ --cooldown-start 150000 \
60
+ --cooldown-steps 20000 \
61
+ --steps 170000 \
62
+ --out-dir "$OUT_DIR"
63
+
64
+ STEP3_DIR=$(ls -dt "$OUT_DIR"/*/args.json 2>/dev/null | head -1 | xargs dirname)
65
+ echo "Step 3 complete: $STEP3_DIR"
66
+ echo ""
67
+ echo "Final checkpoint: $STEP3_DIR/checkpoints/final.pt"
68
+ echo "Copy to checkpoint.pt for submission."
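The config layering the script's header describes (shared values in configs/base.json, each step overriding only what changes) can be sketched as follows; `resolve_args` and the inline base JSON are hypothetical stand-ins, not the trainer's actual argument parser:

```python
import json

# Stand-in for a few fields of configs/base.json
BASE_JSON = '{"lr": 0.0003, "seq_len": 2048, "steps": 125000, "batch_size": 32}'

def resolve_args(base_text: str, overrides: dict) -> dict:
    # Hypothetical sketch of the layering: start from the shared base
    # config, then apply only the flags a given step passes explicitly.
    args = json.loads(base_text)
    args.update(overrides)
    return args

# Step 2 of reproduce.sh overrides seq_len/lr/steps/batch_size, inherits the rest
step2 = resolve_args(BASE_JSON, {"seq_len": 4096, "lr": 3e-5,
                                 "steps": 135000, "batch_size": 64})
```

This is why the per-run `*_args.json` files in repro_runs/ differ only in a handful of fields (resume path, steps, endpoint/cooldown settings).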
reproduce_deterministic.sh ADDED
@@ -0,0 +1,71 @@
1
+ #!/bin/bash
2
+ # Reproduce the best checkpoint in deterministic mode (bit-reproducible).
3
+ #
4
+ # Same three stages as reproduce.sh, but with --deterministic:
5
+ # 1. Train on 2048-point data (~3hr on 1x RTX 4090)
6
+ # 2. Finetune on 4096-point data (~30min)
7
+ # 3. Cooldown with endpoint loss (~2hr)
8
+ #
9
+ # Total: ~5.5hr on a single GPU (no torch.compile, ~2x slower than reproduce.sh).
10
+ # Deterministic mode disables torch.compile and forces CUDA deterministic ops.
11
+ # Results are bit-identical across runs with the same seed. Expected HSS ~0.372.
12
+ set -e
13
+
14
+ OUT_DIR="${1:-runs}"
15
+ BASE="--args-from configs/base.json"
16
+
17
+ # ============================================================
18
+ # Step 1: Train on 2048-point data (Phase 1)
19
+ # ============================================================
20
+ echo "=== Step 1: Training on 2048 data (deterministic) ==="
21
+ python -m s23dr_2026_example.train $BASE \
22
+ --cache-dir hf://usm3d/s23dr-2026-sampled_2048_v2:train \
23
+ --seq-len 2048 \
24
+ --lr 3e-4 \
25
+ --batch-size 32 \
26
+ --steps 125000 \
27
+ --deterministic \
28
+ --out-dir "$OUT_DIR"
29
+
30
+ STEP1_DIR=$(ls -dt "$OUT_DIR"/*/args.json 2>/dev/null | head -1 | xargs dirname)
31
+ echo "Step 1 complete: $STEP1_DIR"
32
+
33
+ # ============================================================
34
+ # Step 2: Finetune on 4096-point data
35
+ # ============================================================
36
+ echo "=== Step 2: Finetuning on 4096 data (deterministic) ==="
37
+ python -m s23dr_2026_example.train $BASE \
38
+ --cache-dir hf://usm3d/s23dr-2026-sampled_4096_v2:train \
39
+ --resume "$STEP1_DIR/checkpoints/step125000.pt" \
40
+ --seq-len 4096 \
41
+ --lr 3e-5 \
42
+ --batch-size 64 \
43
+ --steps 135000 \
44
+ --deterministic \
45
+ --out-dir "$OUT_DIR"
46
+
47
+ STEP2_DIR=$(ls -dt "$OUT_DIR"/*/args.json 2>/dev/null | head -1 | xargs dirname)
48
+ echo "Step 2 complete: $STEP2_DIR"
49
+
50
+ # ============================================================
51
+ # Step 3: Cooldown with endpoint loss
52
+ # ============================================================
53
+ echo "=== Step 3: Cooldown with endpoint loss (deterministic) ==="
54
+ python -m s23dr_2026_example.train $BASE \
55
+ --cache-dir hf://usm3d/s23dr-2026-sampled_4096_v2:train \
56
+ --resume "$STEP2_DIR/checkpoints/step135000.pt" \
57
+ --seq-len 4096 \
58
+ --lr 3e-5 \
59
+ --batch-size 64 \
60
+ --endpoint-weight 0.1 \
61
+ --cooldown-start 150000 \
62
+ --cooldown-steps 20000 \
63
+ --steps 170000 \
64
+ --deterministic \
65
+ --out-dir "$OUT_DIR"
66
+
67
+ STEP3_DIR=$(ls -dt "$OUT_DIR"/*/args.json 2>/dev/null | head -1 | xargs dirname)
68
+ echo "Step 3 complete: $STEP3_DIR"
69
+ echo ""
70
+ echo "Final checkpoint: $STEP3_DIR/checkpoints/final.pt"
71
+ echo "Copy to checkpoint.pt for submission."
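The bit-reproducibility claim rests on fixing the RNG seed (the released runs use seed 353, per the args files) and forcing deterministic ops. A generic, stdlib-only illustration of seed-determined reproducibility (not the project's actual training setup, which additionally disables torch.compile and pins CUDA kernels):

```python
import random

def run_pipeline(seed: int) -> list:
    # Generic illustration: with the RNG seeded identically,
    # every run of the same code produces identical outputs.
    rng = random.Random(seed)
    return [rng.random() for _ in range(4)]

a = run_pipeline(353)  # 353 is the seed used by the released runs
b = run_pipeline(353)
assert a == b  # same seed, identical outputs
```

In the real trainer, nondeterministic GPU kernels would break this even with a fixed seed, which is what the `--deterministic` flag guards against at a ~2x speed cost.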
s23dr_2026_example/attention.py CHANGED
@@ -139,88 +139,3 @@ class FeedForward(nn.Module):
139
  def forward(self, x: torch.Tensor) -> torch.Tensor:
140
  x = self.linear1(x)
141
  return self.linear2(self.activation(x))
142
-
143
-
144
- # =============================================================================
145
- # Custom Transformer Block
146
- # =============================================================================
147
-
148
- class TransformerBlock(nn.Module):
149
- """
150
- Single transformer block combining:
151
- - multi-head SDPA (non-causal)
152
- - layernorm + residual
153
- - feed-forward MLP + residual
154
- """
155
- def __init__(
156
- self,
157
- d_model: int,
158
- num_heads: int,
159
- dim_ff: int,
160
- dropout: float = 0.0,
161
- activation: str = "gelu",
162
- kv_heads: int = None,
163
- ):
164
- super().__init__()
165
- self.norm1 = nn.LayerNorm(d_model)
166
- self.norm2 = nn.LayerNorm(d_model)
167
-
168
- self.attn = MultiHeadSDPA(d_model, num_heads, kv_heads=kv_heads)
169
- self.dropout1 = nn.Dropout(dropout)
170
- self.ffn = FeedForward(d_model, dim_ff, activation=activation)
171
- self.dropout2 = nn.Dropout(dropout)
172
-
173
- def forward(
174
- self,
175
- x: torch.Tensor,
176
- memory: torch.Tensor,
177
- memory_key_padding_mask: torch.Tensor | None = None,
178
- ) -> torch.Tensor:
179
- res = x
180
- x = self.norm1(x)
181
- x = self.attn(x, memory, key_padding_mask=memory_key_padding_mask)
182
- x = res + self.dropout1(x)
183
-
184
- res = x
185
- x = self.norm2(x)
186
- x = self.ffn(x)
187
- return res + self.dropout2(x)
188
-
189
-
190
- class TransformerDecoderSets(nn.Module):
191
- """
192
- A stack of TransformerBlock layers for set-to-set
193
- modeling without causal masks.
194
- """
195
- def __init__(
196
- self,
197
- d_model: int,
198
- num_heads: int,
199
- dim_ff: int,
200
- num_layers: int,
201
- dropout: float = 0.0,
202
- activation: str = "gelu",
203
- kv_heads: int = None,
204
- ):
205
- super().__init__()
206
- self.layers = nn.ModuleList([
207
- TransformerBlock(
208
- d_model,
209
- num_heads,
210
- dim_ff,
211
- dropout=dropout,
212
- activation=activation,
213
- kv_heads=kv_heads,
214
- )
215
- for _ in range(num_layers)
216
- ])
217
-
218
- def forward(
219
- self,
220
- tgt: torch.Tensor,
221
- memory: torch.Tensor,
222
- memory_key_padding_mask: torch.Tensor | None = None,
223
- ) -> torch.Tensor:
224
- for layer in self.layers:
225
- tgt = layer(tgt, memory, memory_key_padding_mask=memory_key_padding_mask)
226
- return tgt
 
139
  def forward(self, x: torch.Tensor) -> torch.Tensor:
140
  x = self.linear1(x)
141
  return self.linear2(self.activation(x))
s23dr_2026_example/cache_scenes.py CHANGED
@@ -23,24 +23,9 @@ Cache format per file (.pt):
23
  """
24
  from __future__ import annotations
25
 
26
- import sys
27
- from pathlib import Path as _Path
28
- if __package__ is None or __package__ == "":
29
- _here = _Path(__file__).resolve().parent
30
- if str(_here.parent) not in sys.path:
31
- sys.path.insert(0, str(_here.parent))
32
- __package__ = _here.name
33
-
34
- import argparse
35
- import time
36
- from concurrent.futures import ProcessPoolExecutor, as_completed
37
- from pathlib import Path
38
-
39
  import numpy as np
40
- import torch
41
 
42
  from .point_fusion import (
43
- FuserConfig, build_compact_scene,
44
  GEST_ID_TO_NAME, ADE_ID_TO_NAME, NUM_GEST,
45
  )
46
 
@@ -191,183 +176,3 @@ def _compute_smart_center_scale(xyz, source, mad_k=2.5, percentile=95.0,
191
  return center.astype(np.float32), np.float32(scale)
192
 
193
 
194
- def _process_one(sample, cfg):
195
- """Process a single HF sample into a cache dict. Returns (order_id, dict) or None."""
196
- rng = np.random.RandomState() # worker-local rng
197
-
198
- n_edges = len(sample.get("wf_edges", []))
199
- if n_edges == 0 or n_edges > 64:
200
- return None
201
-
202
- scene = build_compact_scene(sample, cfg, rng=rng)
203
- if scene is None:
204
- return None
205
-
206
- gt_v = scene.get("gt_vertices")
207
- gt_e = scene.get("gt_edges")
208
- if gt_v is None or gt_e is None or len(gt_e) == 0:
209
- return None
210
-
211
- xyz = scene["xyz"]
212
- source = scene["source"]
213
- visible_src = scene["visible_src"]
214
- visible_id = scene["visible_id"]
215
- behind_id = scene["behind_gest_id"]
216
-
217
- group_id, class_id = _compute_group_and_class(
218
- visible_src, visible_id, behind_id, source
219
- )
220
-
221
- center, scale = _compute_smart_center_scale(xyz, source)
222
-
223
- order_id = sample.get("order_id", "unknown")
224
-
225
- return order_id, {
226
- "xyz": xyz.astype(np.float32),
227
- "source": source.astype(np.uint8),
228
- "group_id": group_id,
229
- "class_id": class_id,
230
- "behind_gest_id": behind_id.astype(np.int16),
231
- "visible_src": visible_src.astype(np.uint8),
232
- "visible_id": visible_id.astype(np.int16),
233
- "n_views_voted": scene["n_views_voted"],
234
- "vote_frac": scene["vote_frac"],
235
- "center": center,
236
- "scale": scale,
237
- "gt_vertices": gt_v.astype(np.float32),
238
- "gt_edges": gt_e.astype(np.int32),
239
- }
240
-
241
-
242
- def main():
243
- p = argparse.ArgumentParser(description="Cache compact scenes from HoHo22k")
244
- g = p.add_mutually_exclusive_group(required=True)
245
- g.add_argument("--data-dir", help="Local dir with shards")
246
- g.add_argument("--streaming", action="store_true", help="Stream from HuggingFace")
247
- p.add_argument("--out-dir", required=True, help="Output directory for .pt files")
248
- p.add_argument("--limit", type=int, default=0)
249
- p.add_argument("--depth-per-view", type=int, default=8000)
250
- p.add_argument("--workers", type=int, default=0,
251
- help="Parallel workers (0=sequential)")
252
- p.add_argument("--skip-existing", action="store_true",
253
- help="Skip samples whose .pt already exists in out-dir")
254
- p.add_argument("--shard-start", type=int, default=0,
255
- help="First shard index (for parallel launches)")
256
- p.add_argument("--shard-stride", type=int, default=1,
257
- help="Stride between shards (e.g. 8 means take every 8th shard)")
258
- args = p.parse_args()
259
-
260
- out_dir = Path(args.out_dir)
261
- out_dir.mkdir(parents=True, exist_ok=True)
262
- existing_ids = set(p.stem for p in out_dir.glob("*.pt")) if args.skip_existing else set()
263
-
264
- # Load dataset
265
- from datasets import load_dataset
266
- if args.streaming:
267
- ds = load_dataset(
268
- "usm3d/hoho22k_2026_trainval",
269
- streaming=True, trust_remote_code=True, split="train",
270
- )
271
- else:
272
- data_root = Path(args.data_dir).resolve()
273
- tars = []
274
- for candidate in [data_root / "data" / "train", data_root / "train", data_root]:
275
- if candidate.exists():
276
- tars = sorted(str(p) for p in candidate.glob("*.tar"))
277
- if tars:
278
- break
279
- loader = None
280
- for c in [data_root / "hoho22k_2026_trainval.py"]:
281
- if c.exists():
282
- loader = c
283
- break
284
- if loader is None:
285
- found = list(data_root.rglob("hoho22k_2026_trainval.py"))
286
- loader = found[0] if found else None
287
- if loader is None:
288
- raise FileNotFoundError("Cannot find loader script")
289
- # Shard-level parallelism: each process handles a slice of tars
290
- if args.shard_stride > 1:
291
- tars = tars[args.shard_start::args.shard_stride]
292
- print(f"Shard slice: start={args.shard_start} stride={args.shard_stride} -> {len(tars)} shards")
293
- ds = load_dataset(str(loader), data_files={"train": tars},
294
- streaming=True, trust_remote_code=True, split="train")
295
-
296
- cfg = FuserConfig(depth_points_per_view=args.depth_per_view)
297
-
298
- saved = 0
299
- skipped = 0
300
- t_start = time.perf_counter()
301
-
302
- if args.workers > 0:
303
- # Parallel: collect samples into batches, process in worker pool
304
- # Note: HF streaming datasets can't be shared across workers, so we
305
- # iterate in the main thread and dispatch processing to workers.
306
- with ProcessPoolExecutor(max_workers=args.workers) as pool:
307
- futures = {}
308
- for i, sample in enumerate(ds):
309
- if args.limit > 0 and i >= args.limit:
310
- break
311
- oid = sample.get("order_id", "unknown")
312
- if oid in existing_ids:
313
- skipped += 1
314
- continue
315
- future = pool.submit(_process_one, sample, cfg)
316
- futures[future] = i
317
-
318
- # Drain completed futures to bound memory
319
- if len(futures) >= args.workers * 4:
320
- done = [f for f in futures if f.done()]
321
- for f in done:
322
- result = f.result()
323
- del futures[f]
324
- if result is None:
325
- skipped += 1
326
- continue
327
- order_id, data = result
328
- torch.save(data, out_dir / f"{order_id}.pt")
329
- saved += 1
330
- if saved % 50 == 0:
331
- elapsed = time.perf_counter() - t_start
332
- print(f"Saved {saved} (skipped {skipped}) "
333
- f"[{saved / elapsed:.1f} samples/s]")
334
-
335
- # Drain remaining
336
- for f in as_completed(futures):
337
- result = f.result()
338
- if result is None:
339
- skipped += 1
340
- continue
341
- order_id, data = result
342
- torch.save(data, out_dir / f"{order_id}.pt")
343
- saved += 1
344
- else:
345
- # Sequential
346
- for i, sample in enumerate(ds):
347
- if args.limit > 0 and i >= args.limit:
348
- break
349
- oid = sample.get("order_id", "unknown")
350
- if oid in existing_ids:
351
- skipped += 1
352
- continue
353
-
354
- result = _process_one(sample, cfg)
355
- if result is None:
356
- skipped += 1
357
- continue
358
- order_id, data = result
359
- torch.save(data, out_dir / f"{order_id}.pt")
360
- saved += 1
361
-
362
- if saved % 50 == 0:
363
- elapsed = time.perf_counter() - t_start
364
- print(f"Saved {saved} (skipped {skipped}) "
365
- f"[{saved / elapsed:.1f} samples/s]")
366
-
367
- elapsed = time.perf_counter() - t_start
368
- print(f"Done. Saved {saved}, skipped {skipped} in {elapsed:.0f}s "
369
- f"({saved / elapsed:.1f} samples/s)")
370
-
371
-
372
- if __name__ == "__main__":
373
- main()
 
23
  """
24
  from __future__ import annotations
25
 
26
  import numpy as np
 
27
 
28
  from .point_fusion import (
 
29
  GEST_ID_TO_NAME, ADE_ID_TO_NAME, NUM_GEST,
30
  )
31
 
 
176
  return center.astype(np.float32), np.float32(scale)
177
 
178
 
s23dr_2026_example/color_mappings.py CHANGED
@@ -181,29 +181,3 @@ ade20k_color_mapping = {
181
  'clock': (102, 255, 0),
182
  'flag': (92, 0, 255),
183
  }
184
-
185
-
186
- EDGE_CLASSES = {'cornice_return': 0,
187
- 'cornice_strip': 1,
188
- 'eave': 2,
189
- 'flashing': 3,
190
- 'hip': 4,
191
- 'rake': 5,
192
- 'ridge': 6,
193
- 'step_flashing': 7,
194
- 'transition_line': 8,
195
- 'valley': 9}
196
- EDGE_CLASSES_BY_ID = {v: k for k, v in EDGE_CLASSES.items()}
197
-
198
- edge_color_mapping = {
199
- 'cornice_return': (215, 62, 138),
200
- 'cornice_strip': (235, 88, 48),
201
- 'eave': (54, 243, 63),
202
- "flashing": (162, 162, 32),
203
- 'hip': (8, 89, 52),
204
- 'rake': (13, 94, 47),
205
- 'ridge': (214, 251, 248),
206
- "step_flashing": (169, 255, 219),
207
- 'transition_line': (200,0,50),
208
- 'valley': (85, 27, 65),
209
- }
 
181
  'clock': (102, 255, 0),
182
  'flag': (92, 0, 255),
183
  }
s23dr_2026_example/data.py CHANGED
@@ -1,6 +1,7 @@
1
  """Data loading for pre-sampled HF datasets.
2
 
3
- Expects pre-sampled npz blobs with xyz_norm [2048, 3] (not full PCD).
 
4
  Use make_sampled_cache.py to produce these from full point clouds.
5
  """
6
  from __future__ import annotations
@@ -12,7 +13,7 @@ import torch
12
 
13
  from .tokenizer import EdgeDepthSequenceConfig
14
 
15
- # Default token budget (must match make_sampled_cache.py)
16
  SEQ_LEN = 2048
17
  COLMAP_POINTS = 1536
18
  DEPTH_POINTS = 512
@@ -130,9 +131,6 @@ def _process_sample(d, aug_rotate, aug_jitter=0.0, aug_drop=0.0, aug_flip=False)
130
  result["n_views_voted"] = torch.as_tensor(d["n_views_voted"], dtype=torch.float32)
131
  if "vote_frac" in d:
132
  result["vote_frac"] = torch.as_tensor(d["vote_frac"], dtype=torch.float32)
133
- if "gt_edge_classes" in d:
134
- result["gt_edge_classes"] = torch.as_tensor(
135
- np.asarray(d["gt_edge_classes"], dtype=np.int64), dtype=torch.long)
136
  return result
137
 
138
 
@@ -161,14 +159,6 @@ def collate(batch):
161
  f"Field '{field}' present in some batch samples but missing in "
162
  f"{len(missing)}/{len(batch)}. Mixed data versions in cache?")
163
  out[field] = torch.stack([d[field] for d in batch])
164
- # gt_edge_classes: variable length per sample (like gt_segments), keep as list
165
- if any("gt_edge_classes" in d for d in batch):
166
- missing = [i for i, d in enumerate(batch) if "gt_edge_classes" not in d]
167
- if missing:
168
- raise KeyError(
169
- f"Field 'gt_edge_classes' present in some batch samples but missing in "
170
- f"{len(missing)}/{len(batch)}. Mixed data versions in cache?")
171
- out["gt_edge_classes"] = [d["gt_edge_classes"] for d in batch]
172
  return out
173
 
174
 
 
1
  """Data loading for pre-sampled HF datasets.
2
 
3
+ Expects pre-sampled npz blobs with xyz_norm (not full PCD).
4
+ Supports both 2048-point and 4096-point datasets.
5
  Use make_sampled_cache.py to produce these from full point clouds.
6
  """
7
  from __future__ import annotations
 
13
 
14
  from .tokenizer import EdgeDepthSequenceConfig
15
 
16
+ # Default token budget (for 2048-point datasets; 4096 uses 3072/1024)
17
  SEQ_LEN = 2048
18
  COLMAP_POINTS = 1536
19
  DEPTH_POINTS = 512
 
131
  result["n_views_voted"] = torch.as_tensor(d["n_views_voted"], dtype=torch.float32)
132
  if "vote_frac" in d:
133
  result["vote_frac"] = torch.as_tensor(d["vote_frac"], dtype=torch.float32)
 
 
 
134
  return result
135
 
136
 
 
159
  f"Field '{field}' present in some batch samples but missing in "
160
  f"{len(missing)}/{len(batch)}. Mixed data versions in cache?")
161
  out[field] = torch.stack([d[field] for d in batch])
 
 
 
 
 
 
 
 
162
  return out
163
 
164
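The guard retained in `collate` above (a field must be present in all batch samples or none, otherwise the cache mixes data versions) can be sketched standalone; `check_field` is a hypothetical name for illustration:

```python
def check_field(batch, field):
    # Sketch of the guard in collate(): a field present in only some
    # samples indicates mixed data versions in the cache.
    present = [field in d for d in batch]
    if any(present) and not all(present):
        missing = present.count(False)
        raise KeyError(
            f"Field '{field}' present in some batch samples but missing in "
            f"{missing}/{len(batch)}. Mixed data versions in cache?")
    return all(present)

batch = [{"xyz": 1, "vote_frac": 0.5}, {"xyz": 2, "vote_frac": 0.7}]
```

A consistent batch passes cleanly; regenerating the cache is the fix when the KeyError fires.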
 
s23dr_2026_example/losses.py CHANGED
@@ -5,7 +5,6 @@ import torch
5
 
6
  from .varifold import varifold_loss_batch
7
  from .sinkhorn import batched_sinkhorn_loss
8
- from .soft_hss_loss import batched_sinkhorn_vertex_f1, batched_soft_hss_v2
9
 
10
  # Varifold config
11
  VARIANT = "simpson3"
@@ -18,21 +17,12 @@ VARIFOLD_CROSS_ONLY = False # Set to True to drop self-energy (avoids O(S^2) bl
18
  SINKHORN_EPS = 0.05
19
  SINKHORN_ITERS = 10
20
 
21
- # Distance thresholds in meters (divided by per-scene scale at runtime)
22
- VERTEX_THRESH_M = 0.5 # vertex match threshold (mirrors real HSS)
23
- TUBE_RADIUS_M = 0.5 # tube IoU radius (mirrors real HSS)
24
-
25
  # Sinkhorn dustbin cost: controls the OT "not matching" penalty.
26
  # Like tau, this is an OT behavior parameter, NOT a physical distance.
27
  # Must be comparable to typical matching costs in normalized space (~0.1).
28
  # Do NOT divide by scale.
29
  SINKHORN_DUSTBIN = 0.1
30
 
31
- # Sigmoid temperature: controls gradient smoothness, NOT a distance threshold.
32
- # Must stay large enough in normalized space to provide useful gradients.
33
- # Do NOT divide by scale (unlike the thresholds above).
34
- SIGMOID_TAU = 0.05
35
-
36
  MAX_GT = 64 # fixed pad size for compile-friendly shapes
37
 
38
  # Precomputed constants (created once on first call)
@@ -65,7 +55,7 @@ def pad_gt_fixed(gt_list, device, dtype):
65
 
66
 
67
  def _loss_inner(pred_segments, gt_pad, gt_mask, gt_lengths, scales,
68
- sigmas, alphas, varifold_w, vertex_f1_w):
69
  """Pure tensor loss -- no Python control flow, no boolean indexing."""
70
  has_gt = (gt_lengths > 0).float()
71
 
@@ -78,77 +68,36 @@ def _loss_inner(pred_segments, gt_pad, gt_mask, gt_lengths, scales,
78
  v = loss_batch / gt_lengths.clamp(min=1.0)
79
  v = (v * has_gt).sum() / has_gt.sum().clamp(min=1.0)
80
 
81
- thresh = VERTEX_THRESH_M / scales
82
- f1 = batched_sinkhorn_vertex_f1(
83
- pred_segments, gt_pad, gt_mask, thresh=thresh, tau=SIGMOID_TAU)
84
- f1 = (f1 * has_gt).sum() / has_gt.sum().clamp(min=1.0)
85
-
86
- total = varifold_w * v + vertex_f1_w * f1
87
- return total, v, f1
88
 
89
 
90
  # Will be replaced with compiled version on CUDA
91
  _loss_fn = _loss_inner
92
 
93
 
94
- def _conf_match_loss(pred_segments, gt_pad, gt_mask, conf_logits, scales):
95
- """Auxiliary BCE loss: train conf to predict whether each segment matches GT.
96
-
97
- Computes per-segment min-distance to GT, creates soft match target via
98
- sigmoid thresholding, and returns BCE(sigmoid(conf), target).
99
- """
100
- B, S = pred_segments.shape[:2]
101
- # Decoupled cost: midpoint + direction + length (same as sinkhorn)
102
- p0, p1 = pred_segments[:, :, 0], pred_segments[:, :, 1]
103
- g0, g1 = gt_pad[:, :, 0], gt_pad[:, :, 1]
104
- mid_p, half_p = 0.5 * (p0 + p1), 0.5 * (p1 - p0)
105
- mid_g, half_g = 0.5 * (g0 + g1), 0.5 * (g1 - g0)
106
- d_mid = torch.linalg.norm(mid_p.unsqueeze(2) - mid_g.unsqueeze(1), dim=-1)
107
- len_p = torch.linalg.norm(half_p, dim=-1, keepdim=True).clamp(min=1e-6)
108
- len_g = torch.linalg.norm(half_g, dim=-1, keepdim=True).clamp(min=1e-6)
109
- dir_p = half_p / len_p
110
- dir_g = half_g / len_g
111
- cos_angle = (dir_p.unsqueeze(2) * dir_g.unsqueeze(1)).sum(dim=-1)
112
- d_dir = 1.0 - cos_angle.abs()
113
- d_len = (len_p.unsqueeze(2) - len_g.unsqueeze(1)).squeeze(-1).abs()
114
- cost = d_mid + d_dir + d_len # [B, S, M]
115
-
116
- # Mask invalid GT with high cost
117
- cost = torch.where(gt_mask.unsqueeze(1), cost, cost.new_tensor(1e6))
118
- min_dist = cost.min(dim=2).values # [B, S]
119
-
120
- # Soft target: sigmoid((thresh - dist) / tau), in normalized space
121
- thresh = VERTEX_THRESH_M / scales # [B]
122
- target = torch.sigmoid((thresh[:, None] - min_dist) / SIGMOID_TAU)
123
-
124
- return torch.nn.functional.binary_cross_entropy_with_logits(
125
- conf_logits, target.detach(), reduction="mean")
126
-
127
-
128
  def compute_loss(pred_segments, gt_list, scales, device,
129
- varifold_w, sinkhorn_w, vertex_f1_w=0.0, soft_hss_w=0.0,
130
  endpoint_w=0.0,
131
- conf_logits=None, conf_weight=0.0, conf_mode="match",
132
  sinkhorn_eps=None, sinkhorn_iters=None,
133
  sinkhorn_dustbin=None, conf_clamp_min=None):
134
  """Combined loss with fixed-size GT padding.
135
 
136
- conf_mode: "match" = BCE matching supervision, "sinkhorn" = conf-weighted sinkhorn.
137
  """
138
  if conf_logits is not None and conf_clamp_min is not None:
139
  conf_logits = conf_logits.clamp(min=conf_clamp_min)
140
  gt_pad, gt_mask, gt_lengths = pad_gt_fixed(gt_list, device, pred_segments.dtype)
141
  c = _get_loss_constants(device, pred_segments.dtype)
142
 
143
- total, v, f1 = _loss_fn(
144
  pred_segments, gt_pad, gt_mask, gt_lengths, scales,
145
- c["sigmas"], c["alphas"], varifold_w, vertex_f1_w)
146
 
147
  terms = {}
148
  if varifold_w > 0:
149
  terms["varifold"] = v.detach()
150
- if vertex_f1_w > 0:
151
- terms["vertex_f1"] = f1.detach()
152
 
153
  if sinkhorn_w > 0:
154
  has_gt = (gt_lengths > 0).float()
@@ -171,28 +120,8 @@ def compute_loss(pred_segments, gt_list, scales, device,
171
  total = total + sinkhorn_w * s
172
  terms["sinkhorn"] = s.detach()
173
 
174
- if soft_hss_w > 0:
175
- has_gt = (gt_lengths > 0).float()
176
- vert_thresh = VERTEX_THRESH_M / scales
177
- edge_thresh = TUBE_RADIUS_M / scales
178
- hss_loss = batched_soft_hss_v2(
179
- pred_segments, gt_pad, gt_mask,
180
- vert_thresh=vert_thresh, edge_thresh=edge_thresh, tau=SIGMOID_TAU)
181
- hs = (hss_loss * has_gt).sum() / has_gt.sum().clamp(min=1.0)
182
- total = total + soft_hss_w * hs
183
- terms["soft_hss"] = hs.detach()
184
-
185
  if conf_logits is not None and conf_weight > 0:
186
- if conf_mode == "match":
187
- # Explicit BCE supervision from nearest-GT distances
188
- cl = _conf_match_loss(pred_segments, gt_pad, gt_mask, conf_logits, scales)
189
- total = total + conf_weight * cl
190
- terms["conf"] = cl.detach()
191
- elif conf_mode in ("sinkhorn", "sinkhorn_detach"):
192
- # Conf trained through sinkhorn transport gradients (via pred_mass).
193
- # sinkhorn_detach: pred_mass uses detached conf, so OT can't push conf negative.
194
- # Add count regularizer to prevent all-zero conf collapse.
195
- # Normalized by S so magnitude doesn't depend on segment count.
196
  conf_w = torch.sigmoid(conf_logits)
197
  S = conf_logits.shape[1]
198
  gt_counts = gt_mask.sum(dim=1).float()
@@ -200,29 +129,6 @@ def compute_loss(pred_segments, gt_list, scales, device,
200
  reg = (((conf_sum - gt_counts) / S) ** 2).mean()
201
  total = total + conf_weight * reg
202
  terms["conf_reg"] = reg.detach()
203
- elif conf_mode == "varifold":
204
- # Conf-weighted varifold: weight each pred segment's contribution
205
- # by sigmoid(conf). Low-conf segments contribute less to the loss.
206
- # Needs regularizer to prevent all-zero conf collapse.
207
- has_gt = (gt_lengths > 0).float()
208
- conf_w = torch.sigmoid(conf_logits) # [B, S]
209
- sigmas_eff = c["sigmas"] / scales[:, None]
210
- vf_conf = varifold_loss_batch(
211
- pred_segments, gt_pad, gt_mask=gt_mask,
212
- variant=VARIANT, sigmas=sigmas_eff, alpha=c["alphas"],
213
- len_pow=LEN_POW, pred_weights=conf_w,
214
- )
215
- vc = (vf_conf / gt_lengths.clamp(min=1.0))
216
- vc = (vc * has_gt).sum() / has_gt.sum().clamp(min=1.0)
217
- # Regularizer: penalize total conf being far from n_gt
218
- # Normalized by S so magnitude doesn't depend on segment count
219
- S = conf_logits.shape[1]
220
- gt_counts = gt_mask.sum(dim=1).float() # [B]
221
- conf_sum = conf_w.sum(dim=1) # [B]
222
- reg = (((conf_sum - gt_counts) / S) ** 2).mean()
223
- total = total + conf_weight * vc + 0.01 * reg
224
- terms["conf_vf"] = vc.detach()
225
- terms["conf_reg"] = reg.detach()
226
  else:
227
  raise ValueError(f"Unknown conf_mode: {conf_mode}")
228
 
@@ -234,14 +140,12 @@ def compute_loss(pred_segments, gt_list, scales, device,
234
  B, S = pred_segments.shape[:2]
235
  M = gt_pad.shape[1]
236
 
237
- # Compute hard assignment via sinkhorn (detached matching is not trained)
238
  with torch.no_grad():
239
  pred_mass_ep = torch.sigmoid(conf_logits) if conf_logits is not None else None
240
  sink_loss_for_assign = batched_sinkhorn_loss(
241
  pred_segments, gt_pad, gt_mask, eps_ep, iters_ep, dustbin_ep,
242
  pred_mass=pred_mass_ep)
243
- # Re-run sinkhorn to get transport matrix for assignment
244
- # (reuse the cost computation from batched_sinkhorn_loss internals)
245
  p0, p1 = pred_segments[:, :, 0], pred_segments[:, :, 1]
246
  g0, g1 = gt_pad[:, :, 0], gt_pad[:, :, 1]
247
  mid_p, half_p = 0.5 * (p0 + p1), 0.5 * (p1 - p0)
 
     from .varifold import varifold_loss_batch
     from .sinkhorn import batched_sinkhorn_loss

 # Varifold config
 VARIANT = "simpson3"

 SINKHORN_EPS = 0.05
 SINKHORN_ITERS = 10

 # Sinkhorn dustbin cost: controls the OT "not matching" penalty.
 # Like tau, this is an OT behavior parameter, NOT a physical distance.
 # Must be comparable to typical matching costs in normalized space (~0.1).
 # Do NOT divide by scale.
 SINKHORN_DUSTBIN = 0.1

 MAX_GT = 64  # fixed pad size for compile-friendly shapes

 # Precomputed constants (created once on first call)

 def _loss_inner(pred_segments, gt_pad, gt_mask, gt_lengths, scales,
+                sigmas, alphas, varifold_w):
     """Pure tensor loss -- no Python control flow, no boolean indexing."""
     has_gt = (gt_lengths > 0).float()

     v = loss_batch / gt_lengths.clamp(min=1.0)
     v = (v * has_gt).sum() / has_gt.sum().clamp(min=1.0)

+    total = varifold_w * v
+    return total, v


 # Will be replaced with compiled version on CUDA
 _loss_fn = _loss_inner


 def compute_loss(pred_segments, gt_list, scales, device,
+                 varifold_w, sinkhorn_w,
                  endpoint_w=0.0,
+                 conf_logits=None, conf_weight=0.0, conf_mode="sinkhorn",
                  sinkhorn_eps=None, sinkhorn_iters=None,
                  sinkhorn_dustbin=None, conf_clamp_min=None):
     """Combined loss with fixed-size GT padding.

+    conf_mode: "sinkhorn" = conf-weighted sinkhorn, "sinkhorn_detach" = detached conf.
     """
     if conf_logits is not None and conf_clamp_min is not None:
         conf_logits = conf_logits.clamp(min=conf_clamp_min)
     gt_pad, gt_mask, gt_lengths = pad_gt_fixed(gt_list, device, pred_segments.dtype)
     c = _get_loss_constants(device, pred_segments.dtype)

+    total, v = _loss_fn(
         pred_segments, gt_pad, gt_mask, gt_lengths, scales,
+        c["sigmas"], c["alphas"], varifold_w)

     terms = {}
     if varifold_w > 0:
         terms["varifold"] = v.detach()

     if sinkhorn_w > 0:
         has_gt = (gt_lengths > 0).float()

         total = total + sinkhorn_w * s
         terms["sinkhorn"] = s.detach()

     if conf_logits is not None and conf_weight > 0:
+        if conf_mode in ("sinkhorn", "sinkhorn_detach"):
             conf_w = torch.sigmoid(conf_logits)
             S = conf_logits.shape[1]
             gt_counts = gt_mask.sum(dim=1).float()

             reg = (((conf_sum - gt_counts) / S) ** 2).mean()
             total = total + conf_weight * reg
             terms["conf_reg"] = reg.detach()
         else:
             raise ValueError(f"Unknown conf_mode: {conf_mode}")

     B, S = pred_segments.shape[:2]
     M = gt_pad.shape[1]

+    # Compute hard assignment via sinkhorn (detached -- matching is not trained)
     with torch.no_grad():
         pred_mass_ep = torch.sigmoid(conf_logits) if conf_logits is not None else None
         sink_loss_for_assign = batched_sinkhorn_loss(
             pred_segments, gt_pad, gt_mask, eps_ep, iters_ep, dustbin_ep,
             pred_mass=pred_mass_ep)
     p0, p1 = pred_segments[:, :, 0], pred_segments[:, :, 1]
     g0, g1 = gt_pad[:, :, 0], gt_pad[:, :, 1]
     mid_p, half_p = 0.5 * (p0 + p1), 0.5 * (p1 - p0)
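The `conf_reg` term in `compute_loss` keeps the total predicted confidence mass close to the number of real GT segments. A minimal standalone sketch of that regularizer (the function name `conf_count_reg` is ours, not part of the repo; it only reproduces the `((conf_sum - gt_counts) / S) ** 2` pattern visible above):

```python
import torch

def conf_count_reg(conf_logits: torch.Tensor, gt_mask: torch.Tensor) -> torch.Tensor:
    """Penalize mismatch between total predicted confidence and GT count.

    conf_logits: [B, S] per-slot confidence logits
    gt_mask:     [B, M] 1 for real GT segments, 0 for padding
    """
    S = conf_logits.shape[1]
    conf_sum = torch.sigmoid(conf_logits).sum(dim=1)   # [B] soft count of kept slots
    gt_counts = gt_mask.sum(dim=1).float()             # [B] true segment counts
    return (((conf_sum - gt_counts) / S) ** 2).mean()
```

Because the residual is normalized by the slot count `S`, the penalty stays in a comparable range regardless of how many query slots the model uses.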
s23dr_2026_example/make_sampled_cache.py CHANGED
@@ -24,21 +24,7 @@ the same points. Fine for now; better augmentation can be added later.
 """
 from __future__ import annotations

-import sys
-from pathlib import Path as _Path
-if __package__ is None or __package__ == "":
-    _here = _Path(__file__).resolve().parent
-    if str(_here.parent) not in sys.path:
-        sys.path.insert(0, str(_here.parent))
-    __package__ = _here.name
-
-import argparse
-import io
-import time
-from pathlib import Path
-
 import numpy as np
-import torch


 # Priority sampling (same logic as train.py)
@@ -87,174 +73,3 @@ def _priority_sample(source, group_id, seq_len, colmap_quota, depth_quota):
     return indices[:seq_len], mask


-def process_sample(xyz, source, group_id, class_id, vis_src, vis_id,
-                   center, scale, gt_v, gt_e, behind=None,
-                   n_views_voted=None, vote_frac=None,
-                   gt_edge_classes=None,
-                   seq_len=2048, colmap_q=1536, depth_q=512):
-    """Sample and normalize one scene. Returns dict of numpy arrays."""
-    indices, mask = _priority_sample(source, group_id, seq_len, colmap_q, depth_q)
-    xyz_norm = ((xyz[indices] - center) / scale).astype(np.float32)
-    gt_seg = np.stack([gt_v[gt_e[:, 0]], gt_v[gt_e[:, 1]]], axis=1)
-    gt_seg_norm = ((gt_seg - center) / scale).astype(np.float32)
-
-    result = {
-        "xyz_norm": xyz_norm,
-        "class_id": class_id[indices].astype(np.uint8),
-        "source": source[indices].astype(np.uint8),
-        "mask": mask,
-        "gt_segments": gt_seg_norm,
-        "scale": np.float32(scale),
-        "center": center.astype(np.float32),
-        "gt_vertices": gt_v.astype(np.float32),
-        "gt_edges": gt_e.astype(np.int32),
-        "visible_src": vis_src[indices].astype(np.uint8),
-        "visible_id": vis_id[indices].astype(np.int16),
-    }
-    if behind is not None:
-        result["behind"] = behind[indices].astype(np.int16)
-    if n_views_voted is not None:
-        result["n_views_voted"] = n_views_voted[indices].astype(np.uint8)
-    if vote_frac is not None:
-        result["vote_frac"] = vote_frac[indices].astype(np.float32)
-    if gt_edge_classes is not None:
-        if len(gt_edge_classes) != len(gt_e):
-            raise ValueError(
-                f"gt_edge_classes length {len(gt_edge_classes)} != "
-                f"gt_edges length {len(gt_e)}")
-        result["gt_edge_classes"] = gt_edge_classes.astype(np.int64)
-    return result
-
-
-def _load_edge_classes(path):
-    """Load edge classifications lookup from npz file."""
-    if path is None:
-        return None
-    path = Path(path)
-    if not path.exists():
-        raise FileNotFoundError(f"Edge classifications file not found: {path}")
-    data = np.load(str(path), allow_pickle=False)
-    lookup = {k: data[k] for k in data.files}
-    print(f"Loaded edge classifications for {len(lookup)} orders from {path}")
-    return lookup
-
-
-def main():
-    p = argparse.ArgumentParser()
-    g = p.add_mutually_exclusive_group(required=True)
-    g.add_argument("--in-dir", help="Local directory of .pt files")
-    g.add_argument("--hf-repo", help="HuggingFace dataset repo (e.g. usm3d/s23dr-2026-cached_full_pcd)")
-    p.add_argument("--split", default="train", help="HF dataset split")
-    p.add_argument("--out-dir", required=True)
-    p.add_argument("--edge-classes", default=None,
-                   help="Path to edge_classifications.npz from extract_edge_classes.py")
-    p.add_argument("--seq-len", type=int, default=2048)
-    p.add_argument("--colmap-quota", type=int, default=1536)
-    p.add_argument("--depth-quota", type=int, default=512)
-    p.add_argument("--seed", type=int, default=7)
-    args = p.parse_args()
-
-    out_dir = Path(args.out_dir)
-    out_dir.mkdir(parents=True, exist_ok=True)
-    np.random.seed(args.seed)
-
-    edge_cls_lookup = _load_edge_classes(args.edge_classes)
-    n_edge_matched, n_edge_missing = 0, 0
-
-    t_start = time.perf_counter()
-    done = 0
-
-    if args.in_dir:
-        # Local .pt files
-        files = sorted(Path(args.in_dir).glob("*.pt"))
-        print(f"Converting {len(files)} local .pt files...")
-        for f in files:
-            out_f = out_dir / (f.stem + ".npz")
-            if out_f.exists():
-                done += 1
-                continue
-            d = torch.load(f, weights_only=False)
-            behind = np.asarray(d["behind_gest_id"], np.int16) if "behind_gest_id" in d else None
-            n_vv = np.asarray(d["n_views_voted"], np.uint8) if "n_views_voted" in d else None
-            vf = np.asarray(d["vote_frac"], np.float32) if "vote_frac" in d else None
-            gt_ec = None
-            if edge_cls_lookup is not None:
-                order_id = f.stem
-                if order_id in edge_cls_lookup:
-                    gt_ec = edge_cls_lookup[order_id]
-                    n_edge_matched += 1
-                else:
-                    n_edge_missing += 1
-            result = process_sample(
-                np.asarray(d["xyz"], np.float32),
-                np.asarray(d["source"], np.uint8),
-                np.asarray(d["group_id"], np.int8),
-                np.asarray(d["class_id"], np.uint8),
-                np.asarray(d["visible_src"], np.uint8),
-                np.asarray(d["visible_id"], np.int16),
-                np.asarray(d["center"], np.float32),
-                float(d["scale"]),
-                np.asarray(d["gt_vertices"], np.float32),
-                np.asarray(d["gt_edges"], np.int32),
-                behind=behind, n_views_voted=n_vv, vote_frac=vf,
-                gt_edge_classes=gt_ec,
-                seq_len=args.seq_len, colmap_q=args.colmap_quota, depth_q=args.depth_quota,
-            )
-            np.savez(out_f, **result)
-            done += 1
-            if done % 2000 == 0:
-                print(f"  {done}/{len(files)} [{done/(time.perf_counter()-t_start):.0f}/s]")
-    else:
-        # HF dataset
-        from datasets import load_dataset
-        print(f"Loading {args.hf_repo} split={args.split}...")
-        ds = load_dataset(args.hf_repo, split=args.split)
-        print(f"Converting {len(ds)} samples...")
-        for i, sample in enumerate(ds):
-            order_id = sample["order_id"]
-            out_f = out_dir / f"{order_id}.npz"
-            if out_f.exists():
-                done += 1
-                continue
-            arrays = np.load(io.BytesIO(sample["data"]))
-            behind = arrays["behind_gest_id"] if "behind_gest_id" in arrays else None
-            n_vv = arrays["n_views_voted"] if "n_views_voted" in arrays else None
-            vf = arrays["vote_frac"] if "vote_frac" in arrays else None
-            gt_ec = None
-            if edge_cls_lookup is not None:
-                if order_id in edge_cls_lookup:
-                    gt_ec = edge_cls_lookup[order_id]
-                    n_edge_matched += 1
-                else:
-                    n_edge_missing += 1
-            result = process_sample(
-                arrays["xyz"], arrays["source"], arrays["group_id"],
-                arrays["class_id"], arrays["visible_src"], arrays["visible_id"],
-                arrays["center"], float(arrays["scale"]),
-                arrays["gt_vertices"], arrays["gt_edges"],
-                behind=behind, n_views_voted=n_vv, vote_frac=vf,
-                gt_edge_classes=gt_ec,
-                seq_len=args.seq_len, colmap_q=args.colmap_quota, depth_q=args.depth_quota,
-            )
-            np.savez(out_f, **result)
-            done += 1
-            if done % 2000 == 0:
-                print(f"  {done}/{len(ds)} [{done/(time.perf_counter()-t_start):.0f}/s]")
-
-    elapsed = time.perf_counter() - t_start
-    print(f"Done: {done} files in {elapsed:.0f}s ({done/max(1,elapsed):.0f}/s)")
-
-    if edge_cls_lookup is not None:
-        print(f"Edge classifications: {n_edge_matched} matched, {n_edge_missing} missing")
-
-    # Report sizes
-    import os
-    npz_files = list(out_dir.glob("*.npz"))
-    if npz_files:
-        sizes = [os.path.getsize(f) for f in npz_files[:100]]
-        print(f"Avg file size: {np.mean(sizes)/1024:.0f}KB")
-        print(f"Est total: {np.mean(sizes)*len(npz_files)/1e9:.1f}GB")
-
-
-if __name__ == "__main__":
-    main()
 
 """
 from __future__ import annotations

 import numpy as np


 # Priority sampling (same logic as train.py)

     return indices[:seq_len], mask
s23dr_2026_example/model.py CHANGED
@@ -239,7 +239,6 @@ class TokenTransformerSegments(nn.Module):
         self.segments = segments
         self.out_vertices = segments * 2
         self.segment_param = segment_param
-        self.length_floor = length_floor
         self.decoder_input_xattn = decoder_input_xattn
         norm_class = norm_class or nn.LayerNorm

@@ -428,167 +427,7 @@ class SelfAttentionEncoderLayer(nn.Module):

 # ---------------------------------------------------------------------------
-# Vanilla transformer: self-attention encoder + segment query decoder
-# ---------------------------------------------------------------------------
-
-class TransformerSegments(nn.Module):
-    """Standard transformer encoder + cross-attention segment decoder.
-
-    Architecture:
-        Input tokens [B, T, D]
-            |
-            v
-        input_proj: Linear -> GELU -> Linear -> Norm  => [B, T, hidden]
-            |
-            v
-        N SelfAttentionEncoderLayers (self-attn over all T tokens)
-            |
-            v
-        Segment decoder (same as Perceiver version):
-            M SegmentDecoderLayers (queries cross-attend to encoded tokens)
-            |
-            v
-        segment_head -> endpoints [B, S, 2, 3] (midpoint_halfvec or midpoint_dir_len)
-    """
-
-    def __init__(
-        self,
-        segments: int = 32,
-        in_dim: int = 128,
-        hidden: int = 128,
-        num_heads: int = 4,
-        kv_heads_cross: int | None = 2,
-        kv_heads_self: int | None = 0,
-        dim_feedforward: int = 256,
-        dropout: float = 0.01,
-        encoder_layers: int = 4,
-        decoder_layers: int = 2,
-        norm_class=None,
-        activation: str = "gelu",
-        segment_conf: bool = False,
-        segment_param: str = "midpoint_halfvec",
-        length_floor: float = 0.0,
-        decoder_input_xattn: bool = False,
-        qk_norm: bool = False,
-        qk_norm_type: str = "l2",
-    ):
-        super().__init__()
-        self.segments = segments
-        self.out_vertices = segments * 2
-        self.segment_param = segment_param
-        self.length_floor = length_floor
-        norm_class = norm_class or nn.LayerNorm
-
-        if kv_heads_cross is not None and kv_heads_cross <= 0:
-            kv_heads_cross = None
-        if kv_heads_self is not None and kv_heads_self <= 0:
-            kv_heads_self = None
-
-        # -- Input projection --
-        self.input_proj = nn.Sequential(
-            nn.Linear(in_dim, dim_feedforward),
-            nn.GELU(),
-            nn.Linear(dim_feedforward, hidden),
-            norm_class(hidden),
-        )
-
-        # -- Self-attention encoder --
-        self.encoder_layers = nn.ModuleList([
-            SelfAttentionEncoderLayer(
-                d_model=hidden,
-                num_heads=num_heads,
-                dim_ff=dim_feedforward,
-                dropout=dropout,
-                activation=activation,
-                kv_heads=kv_heads_self,
-                norm_class=norm_class,
-                qk_norm=qk_norm, qk_norm_type=qk_norm_type,
-            )
-            for _ in range(encoder_layers)
-        ])
-
-        # -- Segment decoder (same structure as Perceiver version) --
-        # Note: for transformer arch, decoder_input_xattn is ignored because
-        # the decoder already cross-attends to the full encoded token sequence.
-        self.query_embed = nn.Embedding(segments, hidden)
-        self.decoder_layers = nn.ModuleList([
-            SegmentDecoderLayer(
-                d_model=hidden,
-                num_heads=num_heads,
-                dim_ff=dim_feedforward,
-                dropout=dropout,
-                activation=activation,
-                kv_heads_cross=kv_heads_cross,
-                kv_heads_self=kv_heads_self,
-                norm_class=norm_class,
-                qk_norm=qk_norm, qk_norm_type=qk_norm_type,
-            )
-            for _ in range(decoder_layers)
-        ])
-
-        # -- Output head (shared logic with Perceiver version) --
-        if segment_param == "midpoint_dir_len":
-            self.segment_head = nn.Linear(hidden, 7)  # mid(3) + dir(3) + len(1)
-        else:
-            self.segment_head = nn.Linear(hidden, 6)  # mid(3) + half(3)
-        self.query_offsets = nn.Parameter(torch.zeros(segments, 2, 3))
-
-        nn.init.trunc_normal_(self.segment_head.weight, mean=0.0, std=1e-3)
-        if self.segment_head.bias is not None:
-            nn.init.zeros_(self.segment_head.bias)
-        if segment_param == "midpoint_dir_len":
-            # sigmoid(-2.2) ~ 0.1 default length in normalized space (~3m)
-            self.segment_head.bias.data[6] = -2.2
-        nn.init.normal_(self.query_offsets, mean=0.0, std=0.05)
-
-        self.segment_conf = segment_conf
-        if segment_conf:
-            self.conf_head = nn.Linear(hidden, 1)
-            nn.init.zeros_(self.conf_head.bias)
-
-    def forward(
-        self,
-        tokens: torch.Tensor,
-        mask: torch.Tensor | None = None,
-    ) -> dict[str, torch.Tensor | list]:
-        B = tokens.shape[0]
-
-        src = self.input_proj(tokens)
-        pad_mask = ~mask.bool() if mask is not None else None
-
-        # Encode: self-attention over all tokens
-        for layer in self.encoder_layers:
-            src = layer(src, key_padding_mask=pad_mask)
-
-        # Decode: segment queries cross-attend to encoded tokens
-        queries = self.query_embed.weight.unsqueeze(0).expand(B, -1, -1)
-        for layer in self.decoder_layers:
-            queries = layer(queries, src)
-
-        # Predict segments -> endpoints
-        if self.segment_param == "midpoint_dir_len":
-            raw = self.segment_head(queries)  # [B, S, 7]
-            mid = raw[:, :, :3] + self.query_offsets[:, 0, :].unsqueeze(0)
-            direction = torch.nn.functional.normalize(raw[:, :, 3:6], dim=-1)
-            length = torch.nn.functional.softplus(raw[:, :, 6:7]) * 0.1
-            half = direction * length * 0.5
-        else:
-            raw = self.segment_head(queries).view(B, self.segments, 2, 3)
-            raw = raw + self.query_offsets.unsqueeze(0)
-            mid, half = raw[:, :, 0], raw[:, :, 1]
-        seg_params = torch.stack([mid - half, mid + half], dim=2)
-
-        vertices = seg_params.reshape(B, self.out_vertices, 3)
-        edges = [[(2 * i, 2 * i + 1) for i in range(self.segments)] for _ in range(B)]
-
-        out = {"vertices": vertices, "segments": seg_params, "edges": edges}
-        if self.segment_conf:
-            out["conf"] = self.conf_head(queries).squeeze(-1)
-        return out
-
-
-# ---------------------------------------------------------------------------
-# End-to-end model: tokenizer embeddings + transformer/perceiver
 # ---------------------------------------------------------------------------

 class EdgeDepthSegmentsModel(nn.Module):
@@ -648,25 +487,9 @@ class EdgeDepthSegmentsModel(nn.Module):
         )

         if arch == "transformer":
-            self.segmenter = TransformerSegments(
-                segments=segments,
-                in_dim=self.tokenizer.out_dim,
-                hidden=hidden,
-                num_heads=num_heads,
-                kv_heads_cross=kv_heads_cross,
-                kv_heads_self=kv_heads_self,
-                dim_feedforward=dim_feedforward,
-                dropout=dropout,
-                encoder_layers=encoder_layers,
-                decoder_layers=decoder_layers,
-                norm_class=norm_class,
-                activation=activation,
-                segment_conf=segment_conf,
-                segment_param=segment_param,
-                length_floor=length_floor,
-                decoder_input_xattn=decoder_input_xattn,
-                qk_norm=qk_norm, qk_norm_type=qk_norm_type,
-            )
         else:
             self.segmenter = TokenTransformerSegments(
                 segments=segments,
 
         self.segments = segments
         self.out_vertices = segments * 2
         self.segment_param = segment_param
         self.decoder_input_xattn = decoder_input_xattn
         norm_class = norm_class or nn.LayerNorm

 # ---------------------------------------------------------------------------
+# End-to-end model: tokenizer embeddings + perceiver
 # ---------------------------------------------------------------------------

 class EdgeDepthSegmentsModel(nn.Module):

         )

         if arch == "transformer":
+            raise ValueError(
+                "arch='transformer' is no longer supported. "
+                "TransformerSegments has been removed; use arch='perceiver'.")
         else:
             self.segmenter = TokenTransformerSegments(
                 segments=segments,
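Both the removed `TransformerSegments` and the surviving `TokenTransformerSegments` decode each segment as a midpoint plus half-vector (`seg_params = torch.stack([mid - half, mid + half], dim=2)`). The endpoint reconstruction can be isolated as a tiny helper (the name `halfvec_to_endpoints` is ours, for illustration only):

```python
import torch

def halfvec_to_endpoints(mid: torch.Tensor, half: torch.Tensor) -> torch.Tensor:
    """Convert [..., 3] midpoints and half-vectors into [..., 2, 3]
    endpoint pairs: p0 = mid - half, p1 = mid + half."""
    return torch.stack([mid - half, mid + half], dim=-2)
```

This parameterization makes segment length (`2 * ||half||`) and position (`mid`) independently adjustable, which is why the `midpoint_dir_len` variant can put a softplus floor on length without touching position.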
s23dr_2026_example/sinkhorn.py CHANGED
@@ -10,26 +10,6 @@ to get useful gradients early and precise matching late.
10
  import torch
11
 
12
 
13
- def segment_pair_cost(pred_segments: torch.Tensor, gt_segments: torch.Tensor) -> torch.Tensor:
14
- """Cost between pred and GT segments: midpoint + direction + length (decoupled).
15
- pred_segments: [N, 2, 3], gt_segments: [M, 2, 3] -> [N, M]
16
- """
17
- p0, p1 = pred_segments[:, 0], pred_segments[:, 1]
18
- g0, g1 = gt_segments[:, 0], gt_segments[:, 1]
19
- mid_p, half_p = 0.5 * (p0 + p1), 0.5 * (p1 - p0)
20
- mid_g, half_g = 0.5 * (g0 + g1), 0.5 * (g1 - g0)
21
- d_mid = torch.cdist(mid_p, mid_g)
22
- len_p = torch.linalg.norm(half_p, dim=-1, keepdim=True).clamp(min=1e-6)
23
- len_g = torch.linalg.norm(half_g, dim=-1, keepdim=True).clamp(min=1e-6)
24
- dir_p = half_p / len_p
25
- dir_g = half_g / len_g
26
- cos_angle = (dir_p[:, None, :] * dir_g[None, :, :]).sum(dim=-1)
27
- d_dir = 1.0 - cos_angle.abs()
28
- d_len = (len_p[:, None, :] - len_g[None, :, :]).squeeze(-1).abs()
29
- return d_mid + d_dir + d_len
30
-
31
-
32
-
33
  def batched_sinkhorn_loss(
34
  pred_segments: torch.Tensor,
35
  gt_pad: torch.Tensor,
@@ -144,38 +124,3 @@ def batched_sinkhorn_loss(
144
 
145
  transport = torch.exp(log_u.unsqueeze(2) + log_v.unsqueeze(1) + log_k)
146
  return (transport * cost_pad).sum(dim=(1, 2)) # [B]
147
-
148
-
149
- # Keep the per-sample version for compatibility
150
- def sinkhorn_segment_loss(
151
- pred_segments: torch.Tensor,
152
- gt_segments: torch.Tensor,
153
- eps: float,
154
- iters: int,
155
- dustbin_cost: float,
156
- pred_mass: torch.Tensor | None = None,
157
- ) -> torch.Tensor:
158
- if pred_segments.numel() == 0 or gt_segments.numel() == 0:
159
- return pred_segments.new_tensor(dustbin_cost)
160
- cost = segment_pair_cost(pred_segments, gt_segments)
161
- n, m = cost.shape
162
- if n == 0 or m == 0:
163
- return cost.new_tensor(dustbin_cost)
164
- cost_pad = torch.full((n + 1, m + 1), dustbin_cost, device=cost.device, dtype=cost.dtype)
165
- cost_pad[:n, :m] = cost
166
- cost_pad[-1, -1] = 0.0
167
- denom = float(n + m)
168
- a = torch.full((n + 1,), 1.0 / denom, device=cost.device, dtype=cost.dtype)
169
- b = torch.full((m + 1,), 1.0 / denom, device=cost.device, dtype=cost.dtype)
170
- a[-1] = m / denom
171
- b[-1] = n / denom
172
- log_a = torch.log(a + 1e-9)
173
- log_b = torch.log(b + 1e-9)
174
- log_k = -cost_pad / eps
175
- log_u = torch.zeros_like(a)
176
- log_v = torch.zeros_like(b)
177
- for _ in range(iters):
178
- log_u = log_a - torch.logsumexp(log_k + log_v[None, :], dim=1)
179
- log_v = log_b - torch.logsumexp(log_k + log_u[:, None], dim=0)
180
- transport = torch.exp(log_u[:, None] + log_v[None, :] + log_k)
181
- return torch.sum(transport * cost_pad)
 
 import torch


 def batched_sinkhorn_loss(
     pred_segments: torch.Tensor,
     gt_pad: torch.Tensor,

     transport = torch.exp(log_u.unsqueeze(2) + log_v.unsqueeze(1) + log_k)
     return (transport * cost_pad).sum(dim=(1, 2))  # [B]
s23dr_2026_example/soft_hss_loss.py DELETED
@@ -1,507 +0,0 @@
1
- import torch
2
-
3
-
4
- def _softmin(values: torch.Tensor, dim: int, tau: float) -> torch.Tensor:
5
- tau_t = torch.as_tensor(tau, device=values.device, dtype=values.dtype).clamp_min(1e-8)
6
- return -tau_t * torch.logsumexp(-values / tau_t, dim=dim)
7
-
8
-
9
- def point_segment_distance_squared(
10
- points: torch.Tensor,
11
- seg_a: torch.Tensor,
12
- seg_b: torch.Tensor,
13
- eps: float = 1e-9,
14
- ) -> torch.Tensor:
15
- """
16
- points: (P,3)
17
- seg_a/seg_b: (S,3)
18
- returns dist2: (P,S)
19
- """
20
- ab = seg_b - seg_a # (S,3)
21
- ab2 = (ab * ab).sum(dim=-1).clamp_min(eps) # (S,)
22
- ap = points[:, None, :] - seg_a[None, :, :] # (P,S,3)
23
- t = (ap * ab[None, :, :]).sum(dim=-1) / ab2[None, :] # (P,S)
24
- t = t.clamp(0.0, 1.0)
25
- closest = seg_a[None, :, :] + t[:, :, None] * ab[None, :, :]
26
- diff = points[:, None, :] - closest
27
- return (diff * diff).sum(dim=-1)
28
-
29
-
30
- def distance_to_segments(
31
- points: torch.Tensor,
32
- segments: torch.Tensor,
33
- eps: float = 1e-9,
34
- ) -> torch.Tensor:
35
- """
36
- points: (P,3)
37
- segments: (S,2,3)
38
- returns min distance: (P,)
39
- """
40
- a = segments[:, 0]
41
- b = segments[:, 1]
42
- dist2 = point_segment_distance_squared(points, a, b, eps=eps)
43
- return torch.sqrt(dist2.min(dim=1).values + eps)
44
-
45
-
46
- def soft_vertex_f1(
47
- pred_vertices: torch.Tensor,
48
- gt_vertices: torch.Tensor,
49
- thresh: float,
50
- tau: float = 0.05,
51
- softmin_tau: float = 0.05,
52
- eps: float = 1e-8,
53
- ) -> torch.Tensor:
54
- """
55
- Soft surrogate for the Hungarian-thresholded corner F1 used by HSS.
56
-
57
- Uses (soft) nearest-neighbor distances and a sigmoid threshold.
58
- """
59
- if pred_vertices.numel() == 0 or gt_vertices.numel() == 0:
60
- return torch.zeros((), device=pred_vertices.device, dtype=pred_vertices.dtype)
61
-
62
- pred = pred_vertices
63
- gt = gt_vertices
64
-
65
- diff = pred[:, None, :] - gt[None, :, :]
66
- dist = torch.sqrt((diff * diff).sum(dim=-1) + eps) # (P,G)
67
-
68
- d_pred = _softmin(dist, dim=1, tau=softmin_tau) # (P,)
69
- d_gt = _softmin(dist, dim=0, tau=softmin_tau) # (G,)
70
-
71
- tau_t = torch.as_tensor(tau, device=dist.device, dtype=dist.dtype).clamp_min(1e-8)
72
- thresh_t = torch.as_tensor(thresh, device=dist.device, dtype=dist.dtype)
73
- p_match = torch.sigmoid((thresh_t - d_pred) / tau_t).mean()
74
- r_match = torch.sigmoid((thresh_t - d_gt) / tau_t).mean()
75
- return 2.0 * p_match * r_match / (p_match + r_match + eps)
76
-
77
-
78
- def soft_tube_iou_mc(
79
- pred_segments: torch.Tensor,
80
- gt_segments: torch.Tensor,
81
- radius: float,
82
- n_samples: int = 4096,
83
- tau: float = 0.05,
84
- seed: int = 0,
85
- eps: float = 1e-8,
86
- ) -> torch.Tensor:
87
- """
88
- Soft surrogate for volumetric tube IoU (edge_thresh in HSS).
89
-
90
- Samples points uniformly in a padded bbox around {pred,gt} endpoints.
91
- Occupancy is sigmoid((radius - d(x, segments))/tau).
92
- IoU is approximated by mean(min(occ_p, occ_g)) / mean(max(occ_p, occ_g)).
93
- """
94
- if pred_segments.numel() == 0 or gt_segments.numel() == 0:
95
- return torch.zeros((), device=pred_segments.device, dtype=pred_segments.dtype)
96
-
97
- pts_all = torch.cat([pred_segments.reshape(-1, 3), gt_segments.reshape(-1, 3)], dim=0)
98
- pad = torch.as_tensor(radius, device=pts_all.device, dtype=pts_all.dtype)
99
- lo = pts_all.min(dim=0).values - pad
100
- hi = pts_all.max(dim=0).values + pad
101
-
102
- gen = torch.Generator(device=pts_all.device)
103
- gen.manual_seed(int(seed))
104
- u = torch.rand((int(n_samples), 3), generator=gen, device=pts_all.device, dtype=pts_all.dtype)
105
- x = lo[None, :] + u * (hi - lo)[None, :]
106
-
107
- d_p = distance_to_segments(x, pred_segments, eps=eps)
108
- d_g = distance_to_segments(x, gt_segments, eps=eps)
109
-
110
- tau_t = torch.as_tensor(tau, device=pts_all.device, dtype=pts_all.dtype).clamp_min(1e-8)
111
- rad_t = torch.as_tensor(radius, device=pts_all.device, dtype=pts_all.dtype)
112
- occ_p = torch.sigmoid((rad_t - d_p) / tau_t)
113
- occ_g = torch.sigmoid((rad_t - d_g) / tau_t)
114
-
115
- inter = torch.minimum(occ_p, occ_g).mean()
116
- union = torch.maximum(occ_p, occ_g).mean().clamp_min(eps)
117
- return inter / union
118
-
119
-
120
- def soft_hss(
121
- pred_segments: torch.Tensor,
122
- gt_segments: torch.Tensor,
123
- gt_vertices: torch.Tensor,
124
- vert_thresh: float = 0.5,
125
- edge_thresh: float = 0.5,
126
- tau: float = 0.05,
127
- softmin_tau: float = 0.05,
128
- n_samples: int = 4096,
129
- seed: int = 0,
130
- eps: float = 1e-8,
131
- ):
132
- """
133
- Returns (soft_hss, soft_f1, soft_iou), all scalars in [0,1] (approximately).
134
- """
135
- pred_vertices = pred_segments.reshape(-1, 3)
136
- f1 = soft_vertex_f1(pred_vertices, gt_vertices, thresh=vert_thresh, tau=tau, softmin_tau=softmin_tau, eps=eps)
137
- iou = soft_tube_iou_mc(
138
- pred_segments,
139
- gt_segments,
140
- radius=edge_thresh,
141
- n_samples=n_samples,
142
- tau=tau,
143
- seed=seed,
144
- eps=eps,
145
- )
146
- denom = (f1 + iou).clamp_min(eps)
147
- hss = 2.0 * f1 * iou / denom
148
- return hss, f1, iou
149
-
150
-
151
- # ---------------------------------------------------------------------------
152
- # Improved: Sinkhorn-matched vertex F1
153
- # ---------------------------------------------------------------------------
154
- #
155
- # The original soft_vertex_f1 uses independent softmin nearest-neighbor
156
- # distances, which allows multiple predicted vertices to claim the same GT
157
- # vertex. This inflates precision and fails to penalize duplicate vertices --
158
- # the exact failure mode that requires merge_vertices post-processing.
159
- #
160
- # This version uses Sinkhorn optimal transport to find a soft one-to-one
161
- # assignment between predicted and GT vertices, then computes precision and
162
- # recall from the matched distances. This is a better surrogate for the
163
- # Hungarian matching used by the real HSS metric.
164
-
165
-
166
- def sinkhorn_vertex_f1(
167
- pred_vertices: torch.Tensor,
168
- gt_vertices: torch.Tensor,
169
- thresh: float = 0.5,
170
- tau: float = 0.05,
171
- eps_sinkhorn: float = 0.05,
172
- iters: int = 20,
173
- eps: float = 1e-8,
174
- ) -> torch.Tensor:
175
- """Soft vertex F1 using Sinkhorn matching (better aligned with real HSS).
176
-
177
- Instead of independent nearest-neighbor distances (which allow double-
178
- claiming), this uses optimal transport to find a soft one-to-one assignment
179
- between predicted and GT vertices.
180
-
181
- Returns a differentiable scalar in [0, 1].
182
- """
183
- if pred_vertices.numel() == 0 or gt_vertices.numel() == 0:
184
- return torch.zeros((), device=pred_vertices.device, dtype=pred_vertices.dtype)
185
-
186
- P = pred_vertices.shape[0]
187
- G = gt_vertices.shape[0]
188
-
189
- # Pairwise distance matrix (P, G)
190
- dist = torch.cdist(pred_vertices, gt_vertices)
191
-
192
- # Sinkhorn with dustbin: (P+1) x (G+1)
193
- # Dustbin cost = thresh (unmatched vertices are "at threshold distance")
194
- dustbin = thresh
195
- cost_pad = torch.full((P + 1, G + 1), dustbin, device=dist.device, dtype=dist.dtype)
196
- cost_pad[:P, :G] = dist
197
- cost_pad[-1, -1] = 0.0
198
-
199
- # Uniform masses with dustbin slack
200
- denom = float(P + G)
201
- a = torch.full((P + 1,), 1.0 / denom, device=dist.device, dtype=dist.dtype)
202
- b = torch.full((G + 1,), 1.0 / denom, device=dist.device, dtype=dist.dtype)
203
- a[-1] = G / denom # pred dustbin absorbs unmatched GT
204
- b[-1] = P / denom # GT dustbin absorbs unmatched pred
205
-
206
- # Log-domain Sinkhorn
207
- log_a = torch.log(a + 1e-9)
208
- log_b = torch.log(b + 1e-9)
209
- log_k = -cost_pad / max(eps_sinkhorn, 1e-6)
210
- log_u = torch.zeros_like(a)
211
- log_v = torch.zeros_like(b)
212
- for _ in range(iters):
213
- log_u = log_a - torch.logsumexp(log_k + log_v[None, :], dim=1)
214
- log_v = log_b - torch.logsumexp(log_k + log_u[:, None], dim=0)
215
-
216
- # Transport plan (P+1, G+1)
217
- transport = torch.exp(log_u[:, None] + log_v[None, :] + log_k)
218
-
219
- # Extract the non-dustbin transport (P, G) -- these are the soft assignments
220
- T = transport[:P, :G]
221
-
222
- # For each predicted vertex, its matched distance is the transport-weighted
223
- # average distance to GT vertices
224
- # Normalize rows to sum to 1 (how much of this pred is matched vs dustbin)
225
- row_sums = T.sum(dim=1).clamp_min(eps)
226
- matched_dist_pred = (T * dist).sum(dim=1) / row_sums # (P,)
227
- match_weight_pred = row_sums * denom # how much of this pred is matched (0-1 ish)
228
-
229
- # Same for GT vertices (column perspective)
230
- col_sums = T.sum(dim=0).clamp_min(eps)
231
- matched_dist_gt = (T * dist).sum(dim=0) / col_sums # (G,)
232
- match_weight_gt = col_sums * denom
233
-
234
- # Soft precision: fraction of pred vertices that are matched AND within threshold
235
- tau_t = torch.as_tensor(tau, device=dist.device, dtype=dist.dtype).clamp_min(1e-8)
236
- thresh_t = torch.as_tensor(thresh, device=dist.device, dtype=dist.dtype)
237
-
238
- prec_per = match_weight_pred * torch.sigmoid((thresh_t - matched_dist_pred) / tau_t)
239
- precision = prec_per.mean()
240
-
241
- # Soft recall: fraction of GT vertices that are matched AND within threshold
242
- rec_per = match_weight_gt * torch.sigmoid((thresh_t - matched_dist_gt) / tau_t)
243
- recall = rec_per.mean()
244
-
245
- return 2.0 * precision * recall / (precision + recall + eps)
246
-
247
-
248
- # ---------------------------------------------------------------------------
249
- # Improved: Segment-sampled tube IoU
250
- # ---------------------------------------------------------------------------
251
- #
252
- # The original soft_tube_iou_mc samples random points in the bounding box,
253
- # wasting most samples in empty space. This version samples along the segments
254
- # themselves, concentrating gradient signal where it matters.
255
-
256
-
257
- def _sample_along_segments(segments: torch.Tensor, n_per_seg: int = 64) -> torch.Tensor:
258
- """Sample n_per_seg points uniformly along each segment.
259
-
260
- segments: (S, 2, 3)
261
- returns: (S * n_per_seg, 3)
262
- """
263
- t = torch.linspace(0, 1, n_per_seg, device=segments.device, dtype=segments.dtype)
264
- # (S, 1, 3) + (1, N, 1) * (S, 1, 3) -> (S, N, 3)
265
- a = segments[:, 0:1, :]
266
- b = segments[:, 1:2, :]
267
- pts = a + t[None, :, None] * (b - a)
268
- return pts.reshape(-1, 3)
269
-
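The interpolation in `_sample_along_segments` is just `a + t * (b - a)` with `t` spanning [0, 1] inclusive; a minimal pure-Python sketch (hypothetical `sample_along_segment` helper, no torch):

```python
def sample_along_segment(a, b, n=5):
    # Uniform t in [0, 1] inclusive, mirroring torch.linspace(0, 1, n).
    pts = []
    for i in range(n):
        t = i / (n - 1)
        pts.append(tuple(ai + t * (bi - ai) for ai, bi in zip(a, b)))
    return pts

pts = sample_along_segment((0.0, 0.0, 0.0), (1.0, 2.0, 0.0), n=5)
# pts[0] and pts[-1] are the segment endpoints; pts[2] is the midpoint
```

Both endpoints are included in the samples, so endpoint supervision falls out of the sampled-point losses for free.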
270
-
271
- def segment_sampled_tube_iou(
272
- pred_segments: torch.Tensor,
273
- gt_segments: torch.Tensor,
274
- radius: float = 0.5,
275
- n_per_seg: int = 64,
276
- tau: float = 0.05,
277
- eps: float = 1e-8,
278
- ) -> torch.Tensor:
279
- """Soft tube IoU by sampling along segments instead of in the bounding box.
280
-
281
- Samples points along predicted and GT segments, then checks what fraction
282
- of each set falls within radius of the other. More sample-efficient than
283
- bbox Monte Carlo and gives better gradients.
284
-
285
- Returns a differentiable scalar in [0, 1].
286
- """
287
- if pred_segments.numel() == 0 or gt_segments.numel() == 0:
288
- return torch.zeros((), device=pred_segments.device, dtype=pred_segments.dtype)
289
-
290
- pred_pts = _sample_along_segments(pred_segments, n_per_seg)
291
- gt_pts = _sample_along_segments(gt_segments, n_per_seg)
292
-
293
- tau_t = torch.as_tensor(tau, device=pred_pts.device, dtype=pred_pts.dtype).clamp_min(1e-8)
294
- rad_t = torch.as_tensor(radius, device=pred_pts.device, dtype=pred_pts.dtype)
295
-
296
- # Precision: fraction of pred points within radius of any GT segment
297
- d_pred = distance_to_segments(pred_pts, gt_segments, eps=eps)
298
- prec = torch.sigmoid((rad_t - d_pred) / tau_t).mean()
299
-
300
- # Recall: fraction of GT points within radius of any pred segment
301
- d_gt = distance_to_segments(gt_pts, pred_segments, eps=eps)
302
- rec = torch.sigmoid((rad_t - d_gt) / tau_t).mean()
303
-
304
- # Soft IoU from precision and recall:
305
- # IoU = intersection/union = (P*R) / (P + R - P*R) for occupancy overlap
306
- return prec * rec / (prec + rec - prec * rec + eps)
307
-
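The precision/recall-to-IoU identity in the return is exact when prec = |I|/|P| and rec = |I|/|G| for intersection measure |I|: then IoU = |I|/(|P|+|G|-|I|) = 1/(1/prec + 1/rec - 1) = prec*rec/(prec + rec - prec*rec). A standalone sketch (hypothetical `occupancy_iou`):

```python
def occupancy_iou(prec, rec, eps=1e-8):
    # IoU = |I| / (|P| + |G| - |I|), rewritten in terms of
    # prec = |I|/|P| and rec = |I|/|G|.
    return prec * rec / (prec + rec - prec * rec + eps)
```

Perfect precision and recall give IoU 1; perfect precision with half recall gives IoU 0.5, matching the set-measure intuition.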
308
-
309
- def soft_hss_v2(
310
- pred_segments: torch.Tensor,
311
- gt_segments: torch.Tensor,
312
- gt_vertices: torch.Tensor,
313
- vert_thresh: float = 0.5,
314
- edge_thresh: float = 0.5,
315
- tau: float = 0.05,
316
- sinkhorn_eps: float = 0.05,
317
- sinkhorn_iters: int = 20,
318
- n_per_seg: int = 64,
319
- eps: float = 1e-8,
320
- ):
321
- """Improved soft HSS using Sinkhorn vertex matching + segment-sampled IoU.
322
-
323
- Returns (soft_hss, soft_f1, soft_iou).
324
- """
325
- pred_vertices = pred_segments.reshape(-1, 3)
326
- f1 = sinkhorn_vertex_f1(
327
- pred_vertices, gt_vertices,
328
- thresh=vert_thresh, tau=tau,
329
- eps_sinkhorn=sinkhorn_eps, iters=sinkhorn_iters, eps=eps,
330
- )
331
- iou = segment_sampled_tube_iou(
332
- pred_segments, gt_segments,
333
- radius=edge_thresh, n_per_seg=n_per_seg, tau=tau, eps=eps,
334
- )
335
- denom = (f1 + iou).clamp_min(eps)
336
- hss = 2.0 * f1 * iou / denom
337
- return hss, f1, iou
338
-
339
-
340
-
341
- # ---------------------------------------------------------------------------
342
- # Batched versions for training speed
343
- # ---------------------------------------------------------------------------
344
-
345
-
346
- def batched_sinkhorn_vertex_f1(
347
- pred_segments: torch.Tensor,
348
- gt_pad: torch.Tensor,
349
- gt_mask: torch.Tensor,
350
- thresh: float | torch.Tensor = 0.5,
351
- tau: float | torch.Tensor = 0.05,
352
- eps_sinkhorn: float = 0.05,
353
- iters: int = 10,
354
- eps: float = 1e-8,
355
- ) -> torch.Tensor:
356
- """Batched Sinkhorn vertex F1 loss.
357
-
358
- Args:
359
- pred_segments: [B, S, 2, 3] predicted segments
360
- gt_pad: [B, M, 2, 3] padded GT segments
361
- gt_mask: [B, M] bool mask (True = valid GT segment)
362
- thresh: distance threshold for a vertex match (scalar or [B])
363
- tau: sigmoid temperature (scalar or [B])
364
- Returns:
365
- [B] per-sample (1 - F1) loss
366
- """
367
- B, S = pred_segments.shape[:2]
368
- M = gt_pad.shape[1]
369
- P = S * 2 # pred vertices (both endpoints)
370
-
371
- # Allow per-sample thresh and tau ([B] tensors or scalars)
372
- thresh_t = torch.as_tensor(thresh, device=pred_segments.device, dtype=pred_segments.dtype)
373
- if thresh_t.dim() == 0:
374
- thresh_t = thresh_t.expand(B)
375
- tau_t = torch.as_tensor(tau, device=pred_segments.device, dtype=pred_segments.dtype)
376
- if tau_t.dim() == 0:
377
- tau_t = tau_t.expand(B)
378
- tau_t = tau_t.clamp_min(1e-8)
379
-
380
- pred_verts = pred_segments.reshape(B, P, 3)
381
- gt_verts = gt_pad.reshape(B, M * 2, 3) # will mask invalid ones
382
-
383
- # Build GT vertex mask: each valid segment contributes 2 vertices
384
- gt_vert_mask = gt_mask.unsqueeze(2).expand(B, M, 2).reshape(B, M * 2)
385
- G = M * 2
386
-
387
- # Pairwise distances [B, P, G]
388
- dist = torch.linalg.norm(
389
- pred_verts.unsqueeze(2) - gt_verts.unsqueeze(1), dim=-1)
390
-
391
- # Mask invalid GT with high distance
392
- dist = torch.where(gt_vert_mask.unsqueeze(1), dist, thresh_t[:, None, None] * 10.0)
393
-
394
- # Sinkhorn matching: [B, P+1, G+1]
395
- cost_pad = thresh_t[:, None, None].expand(B, P + 1, G + 1).clone()
396
- cost_pad[:, :P, :G] = dist
397
- cost_pad[:, -1, -1] = 0.0
398
-
399
- gt_counts = gt_vert_mask.sum(dim=1).float() # [B]
400
- n = float(P)
401
- denom = n + gt_counts # [B]
402
-
403
- a = (1.0 / denom).unsqueeze(1).expand(B, P + 1).clone()
404
- a[:, -1] = gt_counts / denom
405
- b = (1.0 / denom).unsqueeze(1).expand(B, G + 1).clone()
406
- b[:, -1] = n / denom
407
- b[:, :G] = b[:, :G] * gt_vert_mask.float()
408
-
409
- log_a = torch.log(a + 1e-9)
410
- log_b = torch.log(b + 1e-9)
411
- log_k = -cost_pad / max(eps_sinkhorn, 1e-6)
412
- log_u = torch.zeros_like(a)
413
- log_v = torch.zeros_like(b)
414
-
415
- for _ in range(iters):
416
- log_u = log_a - torch.logsumexp(log_k + log_v.unsqueeze(1), dim=2)
417
- log_v = log_b - torch.logsumexp(log_k + log_u.unsqueeze(2), dim=1)
418
-
419
- transport = torch.exp(log_u.unsqueeze(2) + log_v.unsqueeze(1) + log_k)
420
- T = transport[:, :P, :G] # [B, P, G]
421
-
422
- # Matched distances
423
- row_sums = T.sum(dim=2).clamp_min(eps)
424
- matched_d_pred = (T * dist).sum(dim=2) / row_sums # [B, P]
425
- w_pred = row_sums * denom.unsqueeze(1)
426
-
427
- col_sums = T.sum(dim=1).clamp_min(eps)
428
- matched_d_gt = (T * dist).sum(dim=1) / col_sums # [B, G]
429
- w_gt = col_sums * denom.unsqueeze(1)
430
-
431
- precision = (w_pred * torch.sigmoid((thresh_t[:, None] - matched_d_pred) / tau_t[:, None])).mean(dim=1)
432
- recall_raw = w_gt * torch.sigmoid((thresh_t[:, None] - matched_d_gt) / tau_t[:, None])
433
- # Mask invalid GT vertices in recall
434
- recall = (recall_raw * gt_vert_mask.float()).sum(dim=1) / gt_counts.clamp_min(1.0)
435
-
436
- f1 = 2.0 * precision * recall / (precision + recall + eps)
437
- return 1.0 - f1 # return loss (1 - F1)
438
-
439
-
440
-
441
- def batched_segment_sampled_iou(
442
- pred_segments: torch.Tensor,
443
- gt_pad: torch.Tensor,
444
- gt_mask: torch.Tensor,
445
- radius: float | torch.Tensor = 0.5,
446
- n_per_seg: int = 32,
447
- tau: float | torch.Tensor = 0.05,
448
- eps: float = 1e-8,
449
- ) -> torch.Tensor:
450
- """Batched segment-sampled tube IoU loss.
451
-
452
- Returns [B] per-sample (1 - IoU) loss.
453
- """
454
- B, S = pred_segments.shape[:2]
455
- M = gt_pad.shape[1]
456
-
457
- # Allow per-sample radius and tau ([B] tensors or scalars)
458
- rad_t = torch.as_tensor(radius, device=pred_segments.device, dtype=pred_segments.dtype)
459
- if rad_t.dim() == 0:
460
- rad_t = rad_t.expand(B)
461
- tau_t = torch.as_tensor(tau, device=pred_segments.device, dtype=pred_segments.dtype)
462
- if tau_t.dim() == 0:
463
- tau_t = tau_t.expand(B)
464
- tau_t = tau_t.clamp_min(1e-8)
465
-
466
- # Sample points along segments
467
- t = torch.linspace(0, 1, n_per_seg, device=pred_segments.device, dtype=pred_segments.dtype)
468
-
469
- # Pred points: [B, S*n_per_seg, 3]
470
- pa = pred_segments[:, :, 0:1, :] # [B, S, 1, 3]
471
- pb = pred_segments[:, :, 1:2, :]
472
- pred_pts = (pa + t[None, None, :, None] * (pb - pa)).reshape(B, S * n_per_seg, 3)
473
-
474
- # GT points: [B, M*n_per_seg, 3]
475
- ga = gt_pad[:, :, 0:1, :]
476
- gb = gt_pad[:, :, 1:2, :]
477
- gt_pts = (ga + t[None, None, :, None] * (gb - ga)).reshape(B, M * n_per_seg, 3)
478
-
479
- # For each pred point, min distance to any point sampled along GT segments
480
- d_pred_to_gt = torch.cdist(pred_pts, gt_pts) # [B, S*n, M*n]
481
- d_pred = d_pred_to_gt.min(dim=2).values # [B, S*n]
482
- prec = torch.sigmoid((rad_t[:, None] - d_pred) / tau_t[:, None]).mean(dim=1) # [B]
483
-
484
- d_gt_to_pred = d_pred_to_gt.min(dim=1).values # [B, M*n]
485
- # Mask invalid GT points
486
- gt_pt_mask = gt_mask.unsqueeze(2).expand(B, M, n_per_seg).reshape(B, M * n_per_seg)
487
- rec_raw = torch.sigmoid((rad_t[:, None] - d_gt_to_pred) / tau_t[:, None])
488
- rec = (rec_raw * gt_pt_mask.float()).sum(dim=1) / gt_pt_mask.float().sum(dim=1).clamp_min(1.0)
489
-
490
- iou = prec * rec / (prec + rec - prec * rec + eps)
491
- return 1.0 - iou # return loss
492
-
493
-
494
- def batched_soft_hss_v2(pred_segments, gt_pad, gt_mask,
495
- vert_thresh=0.5, edge_thresh=0.5, tau=0.05,
496
- sinkhorn_iters=10, n_per_seg=32):
497
- """Batched soft HSS loss. Returns [B] per-sample (1 - HSS)."""
498
- f1_loss = batched_sinkhorn_vertex_f1(
499
- pred_segments, gt_pad, gt_mask,
500
- thresh=vert_thresh, tau=tau, iters=sinkhorn_iters)
501
- iou_loss = batched_segment_sampled_iou(
502
- pred_segments, gt_pad, gt_mask,
503
- radius=edge_thresh, n_per_seg=n_per_seg, tau=tau)
504
- f1 = 1.0 - f1_loss
505
- iou = 1.0 - iou_loss
506
- hss = 2.0 * f1 * iou / (f1 + iou + 1e-8)
507
- return 1.0 - hss
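For intuition, the log-domain Sinkhorn iteration used in both F1 losses can be sketched in plain Python (hypothetical `sinkhorn_log`; balanced marginals, no dustbin row/column):

```python
import math

def logsumexp(xs):
    m = max(xs)
    return m + math.log(sum(math.exp(x - m) for x in xs))

def sinkhorn_log(cost, a, b, eps=0.1, iters=100):
    # Entropic OT in the log domain: returns a transport plan whose
    # row sums approximate a and column sums approximate b.
    n, m = len(cost), len(cost[0])
    log_k = [[-cost[i][j] / eps for j in range(m)] for i in range(n)]
    log_a = [math.log(x) for x in a]
    log_b = [math.log(x) for x in b]
    log_u, log_v = [0.0] * n, [0.0] * m
    for _ in range(iters):
        log_u = [log_a[i] - logsumexp([log_k[i][j] + log_v[j] for j in range(m)])
                 for i in range(n)]
        log_v = [log_b[j] - logsumexp([log_k[i][j] + log_u[i] for i in range(n)])
                 for j in range(m)]
    return [[math.exp(log_u[i] + log_v[j] + log_k[i][j]) for j in range(m)]
            for i in range(n)]

cost = [[0.0, 1.0], [1.0, 0.0]]
P = sinkhorn_log(cost, [0.5, 0.5], [0.5, 0.5])
# Mass concentrates on the cheap diagonal entries.
```

The real losses additionally pad the cost matrix with a dustbin row/column sized so unmatched vertices have somewhere to dump their mass.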
s23dr_2026_example/train.py ADDED
@@ -0,0 +1,530 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Training script for S23DR 2026.
4
+
5
+ Usage:
6
+ python -m s23dr_2026_example.train --cache-dir hf://usm3d/s23dr-2026-sampled_2048_v2:train --steps 80000 --aug-rotate
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import sys
11
+ from pathlib import Path as _Path
12
+ if __package__ is None or __package__ == "":
13
+ _here = _Path(__file__).resolve().parent
14
+ if str(_here.parent) not in sys.path:
15
+ sys.path.insert(0, str(_here.parent))
16
+ __package__ = _here.name
17
+
18
+ import argparse
19
+ import gc
20
+ import json
21
+ import math
22
+ import subprocess
23
+ import time
24
+ from pathlib import Path
25
+
26
+ import numpy as np
27
+ import torch
28
+
29
+ from .tokenizer import EdgeDepthSequenceConfig
30
+ from .model import EdgeDepthSegmentsModel
31
+ from .data import build_loader, build_tokens
32
+ from .losses import compute_loss, _loss_inner
33
+
34
+ # Re-export for eval scripts
35
+ from .data import HFCachedDataset, collate as _collate # noqa: F401
36
+
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Main
40
+ # ---------------------------------------------------------------------------
41
+
42
+ def main():
43
+ p = argparse.ArgumentParser(description="S23DR 2026 training")
44
+ p.add_argument("--cache-dir", default=None, help="HF dataset path (hf://repo:split)")
45
+ p.add_argument("--val-cache-dir", default="", help="Separate cache for validation")
46
+ p.add_argument("--seq-len", type=int, default=2048,
47
+ help="Input sequence length (2048 or 4096, must match dataset)")
48
+ p.add_argument("--arch", choices=["perceiver", "transformer"], default="perceiver",
49
+ help="perceiver=latent bottleneck, transformer=full self-attention encoder")
50
+ p.add_argument("--segments", type=int, default=32)
51
+ p.add_argument("--hidden", type=int, default=128)
52
+ p.add_argument("--ff", type=int, default=512)
53
+ p.add_argument("--latent-tokens", type=int, default=128)
54
+ p.add_argument("--latent-layers", type=int, default=7)
55
+ p.add_argument("--encoder-layers", type=int, default=4,
56
+ help="Encoder layers (transformer arch only)")
57
+ p.add_argument("--pre-encoder-layers", type=int, default=0,
58
+ help="Self-attn layers on full token sequence before perceiver bottleneck")
59
+ p.add_argument("--decoder-layers", type=int, default=3)
60
+ p.add_argument("--decoder-input-xattn", action="store_true",
61
+ help="Add cross-attention from segment queries to input tokens in each decoder layer")
62
+ p.add_argument("--qk-norm", action="store_true",
63
+ help="Normalize Q and K per-head with learned temperature (stabilizes wide models)")
64
+ p.add_argument("--qk-norm-type", choices=["l2", "rms"], default="l2",
65
+ help="QK-norm type: l2 (unit sphere) or rms (RMSNorm, preserves magnitudes)")
66
+ p.add_argument("--learnable-fourier", action="store_true",
67
+ help="Make Fourier positional encoding learnable (vs fixed random)")
68
+ p.add_argument("--num-heads", type=int, default=4, help="Attention heads")
69
+ p.add_argument("--kv-heads-cross", type=int, default=2,
70
+ help="KV heads for cross-attention (GQA; 0 = standard MHA)")
71
+ p.add_argument("--kv-heads-self", type=int, default=2,
72
+ help="KV heads for self-attention (GQA; 0 = standard MHA)")
73
+ p.add_argument("--cross-attn-interval", type=int, default=4,
74
+ help="Perceiver cross-attention frequency (every N latent layers)")
75
+ p.add_argument("--dropout", type=float, default=0.1)
76
+ p.add_argument("--weight-decay", type=float, default=0.01, help="AdamW weight decay")
77
+ p.add_argument("--steps", type=int, default=5000)
78
+ p.add_argument("--batch-size", type=int, default=32)
79
+ p.add_argument("--lr", type=float, default=3e-4)
80
+ p.add_argument("--adam-betas", default="0.9,0.95", help="AdamW beta1,beta2")
81
+ p.add_argument("--warmup", type=int, default=200, help="LR warmup steps")
82
+ p.add_argument("--cosine-decay", action="store_true",
83
+ help="Cosine decay LR after warmup (to lr*0.01 at end)")
84
+ p.add_argument("--cooldown-start", type=int, default=0,
85
+ help="Step to begin linear cooldown to lr*0.01 (0=disabled, constant LR after warmup)")
86
+ p.add_argument("--cooldown-steps", type=int, default=0,
87
+ help="Number of steps for linear cooldown (0=no cooldown)")
88
+ p.add_argument("--seed", type=int, default=7)
89
+ p.add_argument("--deterministic", action="store_true",
90
+ help="Force deterministic mode (disables torch.compile, slower but bit-reproducible)")
91
+ p.add_argument("--varifold-weight", type=float, default=0.0)
92
+ p.add_argument("--varifold-cross-only", action="store_true",
93
+ help="Drop varifold self-energy (avoids O(S^2) spike, sinkhorn handles repulsion)")
94
+ p.add_argument("--sinkhorn-weight", type=float, default=1.0)
95
+ p.add_argument("--sinkhorn-eps", type=float, default=0.1,
96
+ help="Sinkhorn regularization (larger = softer matching, stronger gradients)")
97
+ p.add_argument("--sinkhorn-eps-start", type=float, default=None,
98
+ help="Starting eps for epsilon annealing (anneals to --sinkhorn-eps). None=no annealing.")
99
+ p.add_argument("--sinkhorn-eps-schedule", choices=["linear", "sqrt", "none"], default="none",
100
+ help="Eps annealing schedule ('sqrt' or linear; only active when --sinkhorn-eps-start is set)")
101
+ p.add_argument("--sinkhorn-iters", type=int, default=20,
102
+ help="Sinkhorn iterations")
103
+ p.add_argument("--sinkhorn-dustbin", type=float, default=0.3,
104
+ help="Sinkhorn dustbin cost in normalized space")
105
+ p.add_argument("--endpoint-weight", type=float, default=0.0,
106
+ help="Weight for endpoint distance loss (sinkhorn-matched, symmetric)")
107
+ p.add_argument("--endpoint-warmup", type=int, default=0,
108
+ help="Steps to linearly warm up endpoint weight from 0 (0=instant)")
109
+ p.add_argument("--aug-rotate", action="store_true")
110
+ p.add_argument("--aug-jitter", type=float, default=0.0,
111
+ help="Point position jitter std in normalized space (0=disabled, try 0.005)")
112
+ p.add_argument("--aug-drop", type=float, default=0.0,
113
+ help="Fraction of points to randomly drop (0=disabled, try 0.1)")
114
+ p.add_argument("--aug-flip", action="store_true",
115
+ help="Random mirror along X axis (50%% chance)")
116
+ p.add_argument("--rms-norm", action="store_true", default=True,
117
+ help="Use RMSNorm (default). Use --no-rms-norm for LayerNorm")
118
+ p.add_argument("--no-rms-norm", dest="rms_norm", action="store_false")
119
+ p.add_argument("--activation", default="gelu", help="FFN activation: gelu, relu, relu_sq")
120
+ p.add_argument("--behind-emb-dim", type=int, default=8,
121
+ help="Behind-gestalt embedding dim (0 to disable)")
122
+ p.add_argument("--vote-features", action="store_true",
123
+ help="Add n_views_voted + vote_frac as raw token features (requires v2 data)")
124
+ p.add_argument("--segment-param", choices=["midpoint_halfvec", "midpoint_dir_len"],
125
+ default="midpoint_halfvec",
126
+ help="Output parameterization: halfvec (default) or decoupled direction+length")
127
+ p.add_argument("--length-floor", type=float, default=0.0,
128
+ help="Minimum segment length for midpoint_dir_len (0=no floor)")
129
+ p.add_argument("--segment-conf", action="store_true",
130
+ help="Add per-segment confidence head (use with --conf-thresh at eval)")
131
+ p.add_argument("--conf-weight", type=float, default=0.0,
132
+ help="Weight for confidence loss (requires --segment-conf)")
133
+ p.add_argument("--conf-mode", choices=["sinkhorn", "sinkhorn_detach"], default="sinkhorn",
134
+ help="Confidence training: 'sinkhorn'=OT mass, 'sinkhorn_detach'=OT mass (gradient detached)")
135
+ p.add_argument("--conf-clamp-min", type=float, default=None,
136
+ help="Clamp conf logits to this minimum before sigmoid (e.g., -5)")
137
+ p.add_argument("--conf-head-wd", type=float, default=None,
138
+ help="Separate weight decay for conf head (default: same as other params)")
139
+ p.add_argument("--ema-decay", type=float, default=0.0,
140
+ help="EMA decay rate (0=disabled, try 0.9999). Saves EMA weights in checkpoints.")
141
+ p.add_argument("--out-dir", default=str(_Path(__file__).resolve().parent / "runs"))
142
+ p.add_argument("--resume", default="")
143
+ p.add_argument("--cpu", action="store_true")
144
+ p.add_argument("--args-from", default=None,
145
+ help="Load defaults from a run's args.json (CLI flags override)")
146
+
147
+ # If --args-from is specified, load defaults from that JSON file first,
148
+ # then let CLI flags override.
149
+ raw_args = p.parse_args()
150
+ if raw_args.args_from is not None:
151
+ import json as _json
152
+ args_path = _Path(raw_args.args_from)
153
+ if not args_path.exists():
154
+ raise FileNotFoundError(f"--args-from file not found: {args_path}")
155
+ saved = _json.loads(args_path.read_text())
156
+ valid_dests = {a.dest for a in p._actions}
157
+ defaults = {}
158
+ for k, v in saved.items():
159
+ if k in valid_dests and k != "args_from":
160
+ defaults[k] = v
161
+ p.set_defaults(**defaults)
162
+ args = p.parse_args()
163
+ print(f"Loaded defaults from {args_path} (CLI flags override)")
164
+ else:
165
+ args = raw_args
166
+
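The two-pass pattern above (parse once, seed `set_defaults` from the saved JSON, parse again so explicit CLI flags win) in isolation, with made-up values standing in for a run's args.json:

```python
import argparse

p = argparse.ArgumentParser()
p.add_argument("--lr", type=float, default=3e-4)
p.add_argument("--steps", type=int, default=5000)

saved = {"lr": 1e-4, "steps": 80000}  # stand-in for a saved args.json
valid_dests = {a.dest for a in p._actions}
p.set_defaults(**{k: v for k, v in saved.items() if k in valid_dests})

args = p.parse_args(["--steps", "100"])  # explicit flag beats saved default
```

`_actions` is private argparse API, but it is the same attribute the script itself relies on to filter saved keys.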
167
+ # Validate required args
168
+ if not args.cache_dir:
169
+ p.error("--cache-dir is required (either directly or via --args-from)")
170
+
171
+ # Validate arg compatibility
172
+ if args.arch == "transformer":
173
+ perceiver_only = []
174
+ if args.latent_tokens != 128:
175
+ perceiver_only.append(f"--latent-tokens={args.latent_tokens}")
176
+ if args.latent_layers != 7:
177
+ perceiver_only.append(f"--latent-layers={args.latent_layers}")
178
+ if args.pre_encoder_layers != 0:
179
+ perceiver_only.append(f"--pre-encoder-layers={args.pre_encoder_layers}")
180
+ if args.cross_attn_interval != 4:
181
+ perceiver_only.append(f"--cross-attn-interval={args.cross_attn_interval}")
182
+ if perceiver_only:
183
+ raise ValueError(
184
+ f"Args {', '.join(perceiver_only)} have no effect with --arch transformer. "
185
+ f"Use --arch perceiver or remove them.")
186
+ if args.conf_weight > 0 and not args.segment_conf:
187
+ raise ValueError("--conf-weight requires --segment-conf")
188
+ if args.conf_mode in ("sinkhorn", "sinkhorn_detach") and args.sinkhorn_weight == 0:
189
+ raise ValueError("--conf-mode sinkhorn requires --sinkhorn-weight > 0")
190
+ if args.cosine_decay and args.cooldown_start > 0:
191
+ raise ValueError("--cosine-decay and --cooldown-start are mutually exclusive")
192
+
193
+ device = torch.device("cpu" if args.cpu else ("cuda" if torch.cuda.is_available() else "cpu"))
194
+ print(f"Device: {device}")
195
+ torch.manual_seed(args.seed)
196
+ np.random.seed(args.seed)
197
+
198
+ # Output
199
+ import hashlib, os
200
+ args_hash = hashlib.md5(json.dumps(vars(args), sort_keys=True).encode()).hexdigest()[:4]
201
+ run_tag = time.strftime("%Y%m%d_%H%M%S") + f"_{args_hash}_{os.getpid() % 10000:04d}"
202
+ out_dir = Path(args.out_dir) / run_tag
203
+ out_dir.mkdir(parents=True, exist_ok=True)
204
+ (out_dir / "checkpoints").mkdir(exist_ok=True)
205
+
206
+ # Tee stdout/stderr to run dir
207
+ import sys as _sys
208
+ _log_path = out_dir / "train.log"
209
+ class _Tee:
210
+ def __init__(self, path, stream):
211
+ self._file = open(path, "a")
212
+ self._stream = stream
213
+ def write(self, data):
214
+ self._stream.write(data)
215
+ self._file.write(data)
216
+ self._file.flush()
217
+ def flush(self):
218
+ self._stream.flush()
219
+ self._file.flush()
220
+ _sys.stdout = _Tee(_log_path, _sys.stdout)
221
+ _sys.stderr = _Tee(_log_path, _sys.stderr)
222
+
223
+ git_sha = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True,
224
+ cwd=str(_Path(__file__).parent)).stdout.strip()
225
+ git_dirty = subprocess.run(["git", "diff", "--quiet"], capture_output=True,
226
+ cwd=str(_Path(__file__).parent)).returncode != 0
227
+ run_info = {**vars(args), "git_sha": git_sha, "git_dirty": git_dirty}
228
+ (out_dir / "args.json").write_text(json.dumps(run_info, indent=2, sort_keys=True) + "\n")
229
+
230
+ # Set varifold cross-only mode before compile
231
+ if args.varifold_cross_only:
232
+ from . import losses as L
233
+ L.VARIFOLD_CROSS_ONLY = True
234
+ print("Varifold: cross-only mode (no self-energy)")
235
+
236
+ # Model
237
+ seq_len = args.seq_len
238
+ norm_class = torch.nn.RMSNorm if args.rms_norm else None
239
+ seq_cfg = EdgeDepthSequenceConfig(seq_len=seq_len)
240
+ model = EdgeDepthSegmentsModel(
241
+ seq_cfg=seq_cfg, segments=args.segments, hidden=args.hidden,
242
+ num_heads=args.num_heads, kv_heads_cross=args.kv_heads_cross,
243
+ kv_heads_self=args.kv_heads_self,
244
+ dim_feedforward=args.ff, dropout=args.dropout,
245
+ latent_tokens=args.latent_tokens, latent_layers=args.latent_layers,
246
+ decoder_layers=args.decoder_layers, cross_attn_interval=args.cross_attn_interval,
247
+ norm_class=norm_class, activation=args.activation,
248
+ segment_conf=args.segment_conf,
249
+ segment_param=args.segment_param,
250
+ length_floor=args.length_floor,
251
+ arch=args.arch, encoder_layers=args.encoder_layers,
252
+ pre_encoder_layers=args.pre_encoder_layers,
253
+ behind_emb_dim=args.behind_emb_dim,
254
+ use_vote_features=args.vote_features,
255
+ decoder_input_xattn=args.decoder_input_xattn,
256
+ qk_norm=args.qk_norm,
257
+ qk_norm_type=args.qk_norm_type,
258
+ learnable_fourier=args.learnable_fourier,
259
+ ).to(device)
260
+
261
+ try:
262
+ from torchinfo import summary
263
+ summary(model.segmenter,
264
+ input_data=[torch.zeros(1, seq_len, model.tokenizer.out_dim, device=device),
265
+ torch.ones(1, seq_len, device=device, dtype=torch.bool)],
266
+ col_names=("input_size", "output_size", "num_params"), verbose=1)
267
+ except ImportError:
268
+ pass
269
+ print(f"Total params: {sum(p.numel() for p in model.parameters()):,}")
270
+
271
+ # Compile (skip in deterministic mode for bit-reproducibility)
272
+ torch.set_float32_matmul_precision("high")
273
+ if args.deterministic:
274
+ torch.use_deterministic_algorithms(True)
275
+ torch.backends.cudnn.deterministic = True
276
+ torch.backends.cudnn.benchmark = False
277
+ import os
278
+ os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":16:8")
279
+ print("Deterministic mode: no torch.compile, bit-reproducible but ~3x slower")
280
+ elif device.type == "cuda":
281
+ model.segmenter = torch.compile(model.segmenter, mode="reduce-overhead", fullgraph=True)
282
+ from . import losses as L
283
+ L._loss_fn = torch.compile(_loss_inner, mode="reduce-overhead", fullgraph=True)
284
+ print("Compiled model + loss (reduce-overhead, fullgraph)")
285
+
286
+ # EMA
287
+ ema_model = None
288
+ if args.ema_decay > 0:
289
+ from copy import deepcopy
290
+ ema_model = deepcopy(model).eval()
291
+ for p_ema in ema_model.parameters():
292
+ p_ema.requires_grad_(False)
293
+ print(f"EMA enabled (decay={args.ema_decay})")
294
+
295
+ # Resume
296
+ start_step = 0
297
+ if args.resume:
298
+ ckpt = torch.load(args.resume, map_location=device, weights_only=False)
299
+ try:
300
+ model.load_state_dict(ckpt["model"])
301
+ except RuntimeError:
302
+ state = {k.replace("segmenter._orig_mod.", "segmenter."): v
303
+ for k, v in ckpt["model"].items()}
304
+ model.load_state_dict(state)
305
+ start_step = ckpt.get("step", 0)
306
+ print(f"Resumed from {args.resume} at step {start_step}")
307
+
308
+ betas = tuple(float(x) for x in args.adam_betas.split(","))
309
+
310
+ # Optimizer: AdamW with optional separate conf_head weight decay
311
+ conf_wd = args.conf_head_wd if args.conf_head_wd is not None else args.weight_decay
312
+ if args.conf_head_wd is not None:
313
+ conf_decay_params = []
314
+ other_params = []
315
+ for name, param in model.named_parameters():
316
+ if not param.requires_grad:
317
+ continue
318
+ if 'conf_head' in name:
319
+ conf_decay_params.append(param)
320
+ else:
321
+ other_params.append(param)
322
+ param_groups = [
323
+ {"params": other_params, "weight_decay": args.weight_decay},
324
+ {"params": conf_decay_params, "weight_decay": conf_wd},
325
+ ]
326
+ print(f"Conf head WD: {conf_wd} ({len(conf_decay_params)} params)")
327
+ else:
328
+ param_groups = model.parameters()
329
+
330
+ opt = torch.optim.AdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay,
331
+ betas=betas)
332
+ if args.resume and "optimizer" in ckpt:
333
+ opt.load_state_dict(ckpt["optimizer"])
334
+
335
+ # Data
336
+ torch.manual_seed(args.seed + 7919)
337
+ np.random.seed(args.seed + 7919)
338
+ train_loader = build_loader(args.cache_dir, args.batch_size, aug_rotate=args.aug_rotate,
339
+ aug_jitter=args.aug_jitter, aug_drop=args.aug_drop,
340
+ aug_flip=args.aug_flip)
341
+ val_loader = build_loader(args.val_cache_dir, args.batch_size) if args.val_cache_dir else None
342
+ data_iter = iter(train_loader)
343
+
344
+ # Intervals
345
+ log_int = max(1, min(50, args.steps // 20))
346
+ ckpt_int = 5000
347
+ val_int = ckpt_int if val_loader else 0
348
+
349
+ # Training loop
350
+ global_step = start_step
351
+ loss_ema, loss_sq_ema = 0.0, 0.0
352
+ t_start = time.perf_counter()
353
+
354
+ print(f"Training for {args.steps} steps | {args.segments}seg "
355
+ f"{args.hidden}h {args.latent_tokens}x{args.latent_layers}L "
356
+ f"{args.decoder_layers}D")
357
+
358
+ # Pre-fetch first batch
359
+ try:
360
+ next_batch = next(data_iter)
361
+ except StopIteration:
362
+ data_iter = iter(train_loader)
363
+ next_batch = next(data_iter)
364
+
365
+ # Freeze GC after setup to eliminate stalls during training
366
+ gc.collect()
367
+ gc.freeze()
368
+ gc.disable()
369
+
370
+ amp_ctx = torch.autocast(device_type='cuda', dtype=torch.bfloat16,
371
+ enabled=(device.type == 'cuda'))
372
+
373
+ while global_step < args.steps:
374
+ tokens, masks, gt_list, scales, meta = build_tokens(next_batch, model, device)
375
+
376
+ # Epsilon annealing
377
+ if args.sinkhorn_eps_start is not None and args.sinkhorn_eps_start != args.sinkhorn_eps:
378
+ if args.sinkhorn_eps_schedule == "sqrt":
379
+ ratio_sq = (args.sinkhorn_eps_start / args.sinkhorn_eps) ** 2
380
+ t0 = max(args.steps * 0.8 / max(ratio_sq - 1, 1e-6), 1.0)
381
+ current_eps = args.sinkhorn_eps_start / math.sqrt(1 + global_step / t0)
382
+ current_eps = max(current_eps, args.sinkhorn_eps)
383
+ else:
384
+ frac = min(global_step / max(args.steps * 0.8, 1), 1.0)
385
+ current_eps = args.sinkhorn_eps_start + frac * (args.sinkhorn_eps - args.sinkhorn_eps_start)
386
+ else:
387
+ current_eps = args.sinkhorn_eps
388
+
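The `sqrt` branch picks `t0` so the annealed eps reaches the target exactly at 80% of training, then clamps; a hedged standalone sketch (hypothetical `sqrt_eps`):

```python
import math

def sqrt_eps(step, steps, eps_start=0.3, eps_min=0.1):
    # t0 solves eps_start / sqrt(1 + 0.8 * steps / t0) == eps_min,
    # so the schedule hits eps_min at 80% of training, then clamps.
    ratio_sq = (eps_start / eps_min) ** 2
    t0 = max(steps * 0.8 / max(ratio_sq - 1, 1e-6), 1.0)
    return max(eps_start / math.sqrt(1 + step / t0), eps_min)
```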
389
+ with amp_ctx:
390
+ out = model.forward_tokens(tokens, masks)
391
+ pred = out["segments"]
392
+ conf = out.get("conf")
393
+
394
+ # Endpoint weight warmup
395
+ if args.endpoint_warmup > 0 and global_step < args.endpoint_warmup:
396
+ current_ep_w = args.endpoint_weight * global_step / args.endpoint_warmup
397
+ else:
398
+ current_ep_w = args.endpoint_weight
399
+
400
+ loss, terms = compute_loss(pred, gt_list, scales.to(device), device,
401
+ args.varifold_weight, args.sinkhorn_weight,
402
+ endpoint_w=current_ep_w,
403
+ conf_logits=conf, conf_weight=args.conf_weight,
404
+ conf_mode=args.conf_mode,
405
+ sinkhorn_eps=current_eps,
406
+ sinkhorn_iters=args.sinkhorn_iters,
407
+ sinkhorn_dustbin=args.sinkhorn_dustbin,
408
+ conf_clamp_min=args.conf_clamp_min)
409
+
410
+ loss_val = loss.item()
411
+ # Adaptive loss spike detection
412
+ if global_step < 100:
413
+ loss_ema = loss_val if global_step == start_step else 0.9 * loss_ema + 0.1 * loss_val
414
+ loss_sq_ema = loss_val**2 if global_step == start_step else 0.9 * loss_sq_ema + 0.1 * loss_val**2
415
+ else:
416
+ loss_ema = 0.99 * loss_ema + 0.01 * loss_val
417
+ loss_sq_ema = 0.99 * loss_sq_ema + 0.01 * loss_val**2
418
+ loss_std = max(math.sqrt(max(loss_sq_ema - loss_ema**2, 0)), 1e-6)
419
+ spike_thresh = loss_ema + 5 * loss_std
420
+
421
+ # Skip on total loss spike or NaN
422
+ if not math.isfinite(loss_val) or loss_val > max(spike_thresh, 0.5):
423
+ sample_ids = [m.get("sample_id", "?") for m in meta]
424
+ skip_reason = f"loss={loss_val:.2f} > thresh={spike_thresh:.2f}"
425
+ print(f"Step {global_step}: {skip_reason}, skipping (samples: {sample_ids[:3]})")
426
+ with open(out_dir / "skipped_samples.jsonl", "a") as f:
427
+ f.write(json.dumps({"step": global_step, "reason": skip_reason,
428
+ "samples": sample_ids}) + "\n")
429
+ try:
430
+ next_batch = next(data_iter)
431
+ except StopIteration:
432
+ data_iter = iter(train_loader)
433
+ next_batch = next(data_iter)
434
+ continue
435
+
436
+ opt.zero_grad()
437
+ loss.backward()
438
+
439
+ # Fetch next batch while GPU finishes backward
440
+ try:
441
+ next_batch = next(data_iter)
442
+ except StopIteration:
443
+ data_iter = iter(train_loader)
444
+ next_batch = next(data_iter)
445
+
446
+ torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
447
+
448
+ # LR schedule: warmup -> constant -> optional cooldown or cosine
449
+ if global_step < args.warmup:
450
+ lr = args.lr * (global_step + 1) / max(1, args.warmup)
451
+ elif args.cosine_decay:
452
+ progress = (global_step - args.warmup) / max(1, args.steps - args.warmup)
453
+ lr = args.lr * (0.01 + 0.99 * 0.5 * (1 + math.cos(math.pi * progress)))
454
+ elif args.cooldown_start > 0 and global_step >= args.cooldown_start:
455
+ progress = (global_step - args.cooldown_start) / max(1, args.cooldown_steps)
456
+ lr = args.lr * max(0.01, 1.0 - 0.99 * min(1.0, progress))
457
+ else:
458
+ lr = args.lr
459
+ for pg in opt.param_groups:
460
+ pg["lr"] = lr
461
+ opt.step()
462
+ global_step += 1
463
+
464
+ # EMA update
465
+ if ema_model is not None:
466
+ decay = args.ema_decay
467
+ with torch.no_grad():
468
+ for p_ema, p_model in zip(ema_model.parameters(), model.parameters()):
469
+ p_ema.lerp_(p_model, 1.0 - decay)
470
+
471
+ # Log
472
+ entry = {"step": global_step, "ts": time.time(), "loss": loss.item(), "lr": lr}
473
+ entry.update({k: v.item() for k, v in terms.items()})
474
+ if global_step % log_int == 0:
475
+ grad_norm = sum(p.grad.norm().item()**2 for p in model.parameters()
476
+ if p.grad is not None) ** 0.5
477
+ entry["grad_norm"] = grad_norm
478
+
479
+ if global_step % log_int == 0:
480
+ ms = (time.perf_counter() - t_start) / log_int * 1000
481
+ t_start = time.perf_counter()
482
+ t_str = " ".join(f"{k}={v:.4f}" for k, v in terms.items())
483
+ print(f"[{global_step}/{args.steps}] loss={loss.item():.4f} {t_str} "
484
+ f"lr={lr:.2e} gnorm={entry.get('grad_norm', 0):.3f} [{ms:.0f}ms/step]")
485
+
486
+ if val_int > 0 and global_step % val_int == 0:
487
+ try:
488
+ vl_list = []
489
+ with torch.no_grad(), amp_ctx:
490
+ for vb in val_loader:
491
+ vt, vm, vg, vs, _ = build_tokens(vb, model, device)
492
+ vo = model.forward_tokens(vt, vm)
493
+ vl, _ = compute_loss(vo["segments"], vg, vs.to(device), device,
494
+ args.varifold_weight, args.sinkhorn_weight)
495
+ if math.isfinite(vl.item()):
496
+ vl_list.append(vl.item())
497
+ if vl_list:
498
+ val_loss = float(np.mean(vl_list))
499
+ print(f" val_loss={val_loss:.4f}")
500
+ entry["val_loss"] = val_loss
501
+ except Exception as e:
502
+ print(f" val eval failed: {e}")
503
+
504
+ # Write log entry
505
+ with open(out_dir / "history.jsonl", "a") as f:
506
+ f.write(json.dumps(entry) + "\n")
507
+
508
+ if global_step % ckpt_int == 0:
509
+ try:
510
+ gc.enable(); gc.collect(); gc.freeze(); gc.disable()
511
+ torch.cuda.empty_cache()
512
+ save_dict = {"step": global_step, "model": model.state_dict(),
513
+ "optimizer": opt.state_dict(), "args": vars(args)}
514
+ if ema_model is not None:
515
+ save_dict["ema_model"] = ema_model.state_dict()
516
+ torch.save(save_dict, out_dir / "checkpoints" / f"step{global_step:06d}.pt")
517
+ except Exception as e:
518
+ print(f" checkpoint save failed: {e}")
519
+
520
+ # Final save
521
+ save_dict = {"step": global_step, "model": model.state_dict(),
522
+ "optimizer": opt.state_dict(), "args": vars(args)}
523
+ if ema_model is not None:
524
+ save_dict["ema_model"] = ema_model.state_dict()
525
+ torch.save(save_dict, out_dir / "checkpoints" / "final.pt")
526
+ print(f"Done. {global_step} steps. Output: {out_dir}")
527
+
528
+
529
+ if __name__ == "__main__":
530
+ main()
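As a standalone reference, the "sqrt" branch of the Sinkhorn ε schedule above can be isolated into a pure function. The name `sinkhorn_eps_sqrt` and its argument names are illustrative, not flags of the training script; `t0` is chosen so the inverse-square-root decay reaches the final ε at ~80% of training, then the value is clamped.

```python
import math

def sinkhorn_eps_sqrt(step, eps_start, eps_final, total_steps):
    """Anneal Sinkhorn epsilon from eps_start down to eps_final.

    Mirrors the "sqrt" schedule above: eps(step) = eps_start / sqrt(1 + step/t0),
    with t0 solved so that eps(0.8 * total_steps) == eps_final, then clamped.
    """
    ratio_sq = (eps_start / eps_final) ** 2
    t0 = max(total_steps * 0.8 / max(ratio_sq - 1, 1e-6), 1.0)
    eps = eps_start / math.sqrt(1 + step / t0)
    return max(eps, eps_final)
```

At step 0 this returns `eps_start`, decays monotonically, and sits at `eps_final` from 80% of training onward.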
s23dr_2026_example/varifold.py CHANGED
@@ -1,25 +1,11 @@
  import torch
 
  from .wire_varifold_kernels import (
-     loss_semi_lobatto3,
-     loss_semi_lobatto3_mix,
-     loss_semi_lobatto3_mix_simple,
-     loss_simpson3,
      loss_simpson3_batch,
-     loss_simpson3_mix,
      loss_simpson3_mix_batch,
-     loss_simpson3_lenpow,
-     loss_simpson3_lenpow_mix,
-     loss_semi_legendre,
  )
 
 
- def edges_to_segments(vertices, edges) -> torch.Tensor:
-     verts = torch.as_tensor(vertices, dtype=torch.float32)
-     idx = torch.as_tensor(edges, dtype=torch.long)
-     return torch.stack([verts[idx[:, 0]], verts[idx[:, 1]]], dim=1)
-
-
  def segments_to_vertices_edges(segments: torch.Tensor):
      segs = torch.as_tensor(segments, dtype=torch.float32)
      vertices = segs.reshape(-1, 3)
@@ -27,52 +13,6 @@ def segments_to_vertices_edges(segments: torch.Tensor):
      return vertices, edges
 
 
- def varifold_loss(
-     pred_segments: torch.Tensor,
-     gt_segments: torch.Tensor,
-     sigma: float = 0.1,
-     variant: str = "semi_lobatto3",
-     t_nodes01: torch.Tensor | None = None,
-     t_w: torch.Tensor | None = None,
-     sigmas: torch.Tensor | None = None,
-     alpha: torch.Tensor | None = None,
-     normalize_alpha: bool = True,
-     len_pow: float | None = None,
- ) -> torch.Tensor:
-     p_pred, q_pred = pred_segments[:, 0], pred_segments[:, 1]
-     p_gt, q_gt = gt_segments[:, 0], gt_segments[:, 1]
-
-     if variant == "semi_lobatto3":
-         return loss_semi_lobatto3(p_pred, q_pred, p_gt, q_gt, sigma)
-     if variant == "semi_lobatto3_mix":
-         if sigmas is None or alpha is None:
-             raise ValueError("sigmas and alpha are required for semi_lobatto3_mix")
-         return loss_semi_lobatto3_mix(p_pred, q_pred, p_gt, q_gt, sigmas, alpha, normalize_alpha)
-     if variant == "semi_lobatto3_mix_simple":
-         if sigmas is None or alpha is None:
-             raise ValueError("sigmas and alpha are required for semi_lobatto3_mix_simple")
-         return loss_semi_lobatto3_mix_simple(p_pred, q_pred, p_gt, q_gt, sigmas, alpha, normalize_alpha)
-     if variant == "simpson3":
-         if sigmas is not None or alpha is not None:
-             if sigmas is None or alpha is None:
-                 raise ValueError("sigmas and alpha are required for simpson3 mix")
-             return loss_simpson3_mix(p_pred, q_pred, p_gt, q_gt, sigmas, alpha, normalize_alpha)
-         return loss_simpson3(p_pred, q_pred, p_gt, q_gt, sigma)
-     if variant == "simpson3_lenpow":
-         if len_pow is None:
-             len_pow = 1.0
-         if sigmas is not None or alpha is not None:
-             if sigmas is None or alpha is None:
-                 raise ValueError("sigmas and alpha are required for simpson3_lenpow mix")
-             return loss_simpson3_lenpow_mix(p_pred, q_pred, p_gt, q_gt, sigmas, alpha, len_pow, normalize_alpha)
-         return loss_simpson3_lenpow(p_pred, q_pred, p_gt, q_gt, sigma, len_pow)
-     if variant == "semi_legendre":
-         return loss_semi_legendre(p_pred, q_pred, p_gt, q_gt, sigma, t_nodes01, t_w)
-     if variant in ("centers", "segments_varifold", "semi_lobatto1"):
-         return varifold_loss_centers(pred_segments, gt_segments, sigma)
-     raise ValueError(f"Unknown varifold variant: {variant}")
-
-
  def varifold_loss_batch(
      pred_segments: torch.Tensor,
      gt_segments: torch.Tensor,
@@ -102,95 +42,12 @@ def varifold_loss_batch(
      if pred_weights is not None:
          w_pred = pred_weights.to(device=pred_segments.device, dtype=pred_segments.dtype)
 
-     if variant == "simpson3":
-         if sigmas is not None or alpha is not None:
-             if sigmas is None or alpha is None:
-                 raise ValueError("sigmas and alpha are required for simpson3 mix")
-             return loss_simpson3_mix_batch(p_pred, q_pred, p_gt, q_gt, sigmas, alpha, w_gt=w_gt, w_pred=w_pred, normalize_alpha=normalize_alpha, cross_only=cross_only)
-         return loss_simpson3_batch(p_pred, q_pred, p_gt, q_gt, sigma, w_gt=w_gt, w_pred=w_pred)
-
-     # Fallback to per-sample loop for unsupported variants.
-     losses = []
-     sigmas_t = None
-     if sigmas is not None:
-         sigmas_t = torch.as_tensor(sigmas, device=pred_segments.device, dtype=pred_segments.dtype)
-     for idx in range(pred_segments.shape[0]):
-         gt_b = gt_segments[idx]
-         if gt_mask is not None:
-             gt_b = gt_b[gt_mask[idx]]
-         sigmas_i = sigmas
-         if sigmas_t is not None and sigmas_t.ndim == 2:
-             sigmas_i = sigmas_t[idx]
-         losses.append(
-             varifold_loss(
-                 pred_segments[idx],
-                 gt_b,
-                 sigma=sigma,
-                 variant=variant,
-                 t_nodes01=t_nodes01,
-                 t_w=t_w,
-                 sigmas=sigmas_i,
-                 alpha=alpha,
-                 normalize_alpha=normalize_alpha,
-                 len_pow=len_pow,
-             )
-         )
-     return torch.stack(losses, dim=0)
-
-
- def varifold_loss_centers(
-     pred_segments: torch.Tensor,
-     gt_segments: torch.Tensor,
-     sigma: float = 0.1,
-     normalize_weights: bool = True,
- ) -> torch.Tensor:
-     eps = 1e-8
-     a_p, b_p = pred_segments[:, 0], pred_segments[:, 1]
-     a_g, b_g = gt_segments[:, 0], gt_segments[:, 1]
-
-     v_p = b_p - a_p
-     v_g = b_g - a_g
-     len_p = torch.linalg.norm(v_p, dim=-1)
-     len_g = torch.linalg.norm(v_g, dim=-1)
-
-     x_p = 0.5 * (a_p + b_p)
-     x_g = 0.5 * (a_g + b_g)
-
-     u_p = v_p / (len_p[:, None] + eps)
-     u_g = v_g / (len_g[:, None] + eps)
-
-     w_p = len_p
-     w_g = len_g
-     if normalize_weights:
-         w_p = w_p / (w_p.sum() + eps)
-         w_g = w_g / (w_g.sum() + eps)
-
-     diff_pp = x_p[:, None, :] - x_p[None, :, :]
-     diff_gg = x_g[:, None, :] - x_g[None, :, :]
-     diff_pg = x_p[:, None, :] - x_g[None, :, :]
-     d_pp = (diff_pp * diff_pp).sum(dim=-1)
-     d_gg = (diff_gg * diff_gg).sum(dim=-1)
-     d_pg = (diff_pg * diff_pg).sum(dim=-1)
-
-     inv2s2 = 1.0 / (2.0 * sigma * sigma)
-     k_pp = torch.exp(-d_pp * inv2s2)
-     k_gg = torch.exp(-d_gg * inv2s2)
-     k_pg = torch.exp(-d_pg * inv2s2)
-
-     dot_pp = (u_p[:, None, :] * u_p[None, :, :]).sum(dim=-1)
-     dot_gg = (u_g[:, None, :] * u_g[None, :, :]).sum(dim=-1)
-     dot_pg = (u_p[:, None, :] * u_g[None, :, :]).sum(dim=-1)
-
-     k_pp = k_pp * (dot_pp * dot_pp)
-     k_gg = k_gg * (dot_gg * dot_gg)
-     k_pg = k_pg * (dot_pg * dot_pg)
-
-     wp_row = w_p[:, None]
-     wp_col = w_p[None, :]
-     wg_row = w_g[:, None]
-     wg_col = w_g[None, :]
-
-     a_pp = (wp_row * wp_col * k_pp).sum(dim=-1).sum(dim=-1)
-     a_gg = (wg_row * wg_col * k_gg).sum(dim=-1).sum(dim=-1)
-     a_pg = (w_p[:, None] * w_g[None, :] * k_pg).sum(dim=-1).sum(dim=-1)
-     return a_pp + a_gg - 2.0 * a_pg
+     if variant != "simpson3":
+         raise ValueError(
+             f"Unsupported varifold variant: {variant!r}. "
+             f"Only 'simpson3' is supported in batch mode.")
+     if sigmas is not None or alpha is not None:
+         if sigmas is None or alpha is None:
+             raise ValueError("sigmas and alpha are required for simpson3 mix")
+         return loss_simpson3_mix_batch(p_pred, q_pred, p_gt, q_gt, sigmas, alpha, w_gt=w_gt, w_pred=w_pred, normalize_alpha=normalize_alpha, cross_only=cross_only)
+     return loss_simpson3_batch(p_pred, q_pred, p_gt, q_gt, sigma, w_gt=w_gt, w_pred=w_pred)
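The surviving `segments_to_vertices_edges` helper is just a reshape plus an index ladder: N segments become 2N vertices and N edges (2i, 2i+1). A dependency-free sketch of the same mapping on plain Python lists (illustrative only; the real helper operates on tensors):

```python
def segments_to_vertices_edges(segments):
    """Flatten N segments of 3D endpoint pairs into (vertices, edges).

    segments: iterable of (p, q) endpoint pairs, each a 3-tuple.
    Returns 2N vertices and N edges; edge i connects vertices 2i and 2i+1.
    """
    vertices = []
    edges = []
    for i, (p, q) in enumerate(segments):
        vertices.extend([list(p), list(q)])
        edges.append((2 * i, 2 * i + 1))
    return vertices, edges
```

Note that shared endpoints are duplicated; collapsing them is left to the merge step in post-processing.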
s23dr_2026_example/wire_varifold_kernels.py CHANGED
@@ -1,4 +1,3 @@
- import math
  import torch
 
  # -----------------------------
@@ -46,7 +45,7 @@ def _prepare_mix_weights(sigmas, alpha, device, dtype, normalize_alpha: bool):
      return sigmas_t, alpha_t
 
  # -----------------------------
- # 1) Simpson-3 on both segments (3x3 product rule)
+ # Simpson-3 on both segments (3x3 product rule)
  # -----------------------------
  def _prep_weight(w, n: int, b: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor | None:
      if w is None:
@@ -121,227 +120,9 @@ def cross_simpson3(
      return out[0] if not batched else out
 
 
- def cross_simpson3_lenpow(
-     pA,
-     qA,
-     pB,
-     qB,
-     sigma: float | torch.Tensor,
-     len_pow: float,
-     wA: torch.Tensor | None = None,
-     wB: torch.Tensor | None = None,
- ):
-     device, dtype = pA.device, pA.dtype
-     batched = pA.dim() == 3
-     if not batched:
-         pA = pA.unsqueeze(0)
-         qA = qA.unsqueeze(0)
-         pB = pB.unsqueeze(0)
-         qB = qB.unsqueeze(0)
-     nodes = LOBATTO3_NODES.to(device=device, dtype=dtype)
-     w2 = LOBATTO3_W2.to(device=device, dtype=dtype)
-
-     bsz, nA, _ = pA.shape
-     nB = pB.shape[1]
-     wA = _prep_weight(wA, nA, bsz, device, dtype)
-     wB = _prep_weight(wB, nB, bsz, device, dtype)
-
-     _, _, ellA, uA = segment_geom(pA, qA)
-     _, _, ellB, uB = segment_geom(pB, qB)
-
-     XA = sample_points(pA, qA, nodes)  # (B,N,3,3)
-     YB = sample_points(pB, qB, nodes)  # (B,M,3,3)
-
-     ang = torch.matmul(uA, uB.transpose(-1, -2)).pow(2)
-     lenfac = (ellA[:, :, None] * ellB[:, None, :]).pow(len_pow)
-     if wA is not None or wB is not None:
-         if wA is None:
-             wA = torch.ones((bsz, nA), device=device, dtype=dtype)
-         if wB is None:
-             wB = torch.ones((bsz, nB), device=device, dtype=dtype)
-         lenfac = lenfac * (wA[:, :, None] * wB[:, None, :])
-
-     diff = XA[:, :, None, :, None, :] - YB[:, None, :, None, :, :]  # (B,N,M,3,3,3)
-     r2 = (diff * diff).sum(dim=-1)  # (B,N,M,3,3)
-     sigma_t = torch.as_tensor(sigma, device=device, dtype=dtype)
-     if sigma_t.ndim == 0:
-         inv2s2 = 1.0 / (2.0 * sigma_t * sigma_t)
-     else:
-         if sigma_t.shape[0] != bsz:
-             raise ValueError(f"sigma batch {sigma_t.shape[0]} != {bsz}")
-         inv2s2 = (1.0 / (2.0 * sigma_t * sigma_t)).view(bsz, 1, 1, 1, 1)
-     K = torch.exp(-r2 * inv2s2)  # (B,N,M,3,3)
-
-     spatial = (K * w2).sum(dim=-1).sum(dim=-1)  # (B,N,M)
-     out = (ang * lenfac * spatial).sum(dim=-1).sum(dim=-1)  # (B,)
-     return out[0] if not batched else out
-
-
- # -----------------------------
- # 2/3) Semi-analytic in s, quadrature in t
- #      - Lobatto-3 (endpoints+midpoint)
- #      - Gauss-Legendre Q (nodes/weights passed in)
- # -----------------------------
- def cross_semi_analytic(pA, qA, pB, qB, sigma: float, t_nodes01: torch.Tensor, t_w: torch.Tensor):
-     """
-     Gaussian k_x. Integrate s exactly along A, integrate t numerically along B.
-     t_nodes01, t_w: (Q,) nodes/weights on [0,1] (constants you pass in)
-     """
-     device, dtype = pA.device, pA.dtype
-     t = t_nodes01.to(device=device, dtype=dtype)  # (Q,)
-     w = t_w.to(device=device, dtype=dtype)  # (Q,)
-
-     dA, aA, ellA, uA = segment_geom(pA, qA)
-     dB, _, ellB, uB = segment_geom(pB, qB)
-
-     # (N,M) factors
-     ang = (uA @ uB.t()).pow(2)
-     lenfac = ellA[:, None] * ellB[None, :]
-
-     # r0: (N,M,3)
-     r0 = pA[:, None, :] - pB[None, :, :]
-
-     # r(t): (N,M,Q,3)
-     r = r0[:, :, None, :] - t[None, None, :, None] * dB[None, :, None, :]
-
-     # beta, r2: (N,M,Q)
-     beta = (r * dA[:, None, None, :]).sum(dim=-1)
-     r2 = (r * r).sum(dim=-1)
-
-     # semi-analytic constants per A segment: shapes broadcast to (N,1,1)
-     a = aA.clamp_min(1e-12)
-     inv_a = (1.0 / a).view(-1, 1, 1)
-     denom = (torch.sqrt(2.0 * a) * sigma).view(-1, 1, 1)
-     pref = (math.sqrt(math.pi) * sigma / torch.sqrt(2.0 * a)).view(-1, 1, 1)
-
-     # J(t): (N,M,Q)
-     exp_term = torch.exp(-(r2 - (beta * beta) * inv_a) / (2.0 * sigma * sigma))
-     erf1 = torch.special.erf((a.view(-1, 1, 1) + beta) / denom)
-     erf0 = torch.special.erf(beta / denom)
-     J = pref * (erf1 - erf0) * exp_term
-
-     # integrate over t: (N,M)
-     spatial = (J * w.view(1, 1, -1)).sum(dim=-1)
-     return (ang * lenfac * spatial).sum(dim=-1).sum(dim=-1)
-
-
- def cross_semi_lobatto3(pA, qA, pB, qB, sigma: float):
-     device, dtype = pA.device, pA.dtype
-     t = LOBATTO3_NODES.to(device=device, dtype=dtype)
-     w = LOBATTO3_W.to(device=device, dtype=dtype)
-     return cross_semi_analytic(pA, qA, pB, qB, sigma, t, w)
-
-
- def cross_semi_lobatto3_mix(
-     pA,
-     qA,
-     pB,
-     qB,
-     sigmas,
-     alpha,
-     normalize_alpha: bool = True,
- ):
-     """
-     Semi-analytic in s (along A), Lobatto-3 in t (along B), with a sigma mixture.
-     """
-     device, dtype = pA.device, pA.dtype
-     t_nodes = LOBATTO3_NODES.to(device=device, dtype=dtype)
-     t_w = LOBATTO3_W.to(device=device, dtype=dtype)
-
-     sigmas_t, alpha_t = _prepare_mix_weights(sigmas, alpha, device, dtype, normalize_alpha)
-
-     dA, aA, ellA, uA = segment_geom(pA, qA)
-     dB, _, ellB, uB = segment_geom(pB, qB)
-
-     ang = (uA @ uB.t()).pow(2)
-     lenfac = ellA[:, None] * ellB[None, :]
-
-     r0 = pA[:, None, :] - pB[None, :, :]
-
-     a = aA.clamp_min(1e-12)
-     inv_a = (1.0 / a).view(-1, 1)
-     sqrt_a = torch.sqrt(2.0 * a).clamp_min(1e-12)
-
-     denom = (sqrt_a[:, None] * sigmas_t[None, :]).clamp_min(1e-12)
-     pref = math.sqrt(math.pi) * sigmas_t[None, :] / sqrt_a[:, None]
-     inv2s2 = (1.0 / (2.0 * sigmas_t * sigmas_t)).view(1, 1, -1)
-
-     denom_nmS = denom[:, None, :]
-     pref_nmS = pref[:, None, :]
-     alpha_nmS = alpha_t.view(1, 1, -1)
-     a_nm1 = a[:, None, None]
-
-     spatial = torch.zeros((pA.shape[0], pB.shape[0]), device=device, dtype=dtype)
-     for tk, wk in zip(t_nodes, t_w):
-         r = r0 - tk * dB[None, :, :]
-         beta = (r * dA[:, None, :]).sum(dim=-1)
-         r2 = (r * r).sum(dim=-1)
-         core = r2 - (beta * beta) * inv_a
-
-         exp_term = torch.exp(-core[:, :, None] * inv2s2)
-         erf1 = torch.special.erf((a_nm1 + beta[:, :, None]) / denom_nmS)
-         erf0 = torch.special.erf(beta[:, :, None] / denom_nmS)
-         J = pref_nmS * (erf1 - erf0) * exp_term
-         spatial = spatial + wk * (J * alpha_nmS).sum(dim=-1)
-
-     return (ang * lenfac * spatial).sum(dim=-1).sum(dim=-1)
-
-
  # -----------------------------
- # Full losses (self + self - 2 cross)
+ # Batch losses
  # -----------------------------
- # def loss_simpson3(p_pred, q_pred, p_gt, q_gt, sigma: float):
- #     s_pred = cross_simpson3(p_pred, q_pred, p_pred, q_pred, sigma)
- #     # s_gt = cross_simpson3(p_gt, q_gt, p_gt, q_gt, sigma)
- #     cross = cross_simpson3(p_pred, q_pred, p_gt, q_gt, sigma)
- #     # return s_pred + s_gt - 2.0 * cross
- #     return s_pred - 2.0 * cross
-
-
- def loss_simpson3(p_pred, q_pred, p_gt, q_gt, sigma: float):
-     s_pred = cross_simpson3(p_pred, q_pred, p_pred, q_pred, sigma)
-     # s_gt = cross_simpson3(p_gt, q_gt, p_gt, q_gt, sigma)
-     cross = cross_simpson3(p_pred, q_pred, p_gt, q_gt, sigma)
-     # return s_pred + s_gt - 2.0 * cross
-     return s_pred - 2.0 * cross
-
-
- def loss_simpson3_lenpow(p_pred, q_pred, p_gt, q_gt, sigma: float, len_pow: float):
-     s_pred = cross_simpson3_lenpow(p_pred, q_pred, p_pred, q_pred, sigma, len_pow)
-     # s_gt = cross_simpson3_lenpow(p_gt, q_gt, p_gt, q_gt, sigma, len_pow)
-     cross = cross_simpson3_lenpow(p_pred, q_pred, p_gt, q_gt, sigma, len_pow)
-     # return s_pred + s_gt - 2.0 * cross
-     return s_pred - 2.0 * cross
-
-
- def loss_simpson3_mix(
-     p_pred,
-     q_pred,
-     p_gt,
-     q_gt,
-     sigmas,
-     alpha,
-     normalize_alpha: bool = True,
- ):
-     device, dtype = p_pred.device, p_pred.dtype
-     sigmas_t, alpha_t = _prepare_mix_weights(sigmas, alpha, device, dtype, normalize_alpha)
-     losses = [loss_simpson3(p_pred, q_pred, p_gt, q_gt, s) for s in sigmas_t]
-     return (torch.stack(losses) * alpha_t).sum()
-
-
- # def loss_simpson3_batch(
- #     p_pred: torch.Tensor,
- #     q_pred: torch.Tensor,
- #     p_gt: torch.Tensor,
- #     q_gt: torch.Tensor,
- #     sigma: float | torch.Tensor,
- #     w_gt: torch.Tensor | None = None,
- # ) -> torch.Tensor:
- #     s_pred = cross_simpson3(p_pred, q_pred, p_pred, q_pred, sigma)
- #     # s_gt = cross_simpson3(p_gt, q_gt, p_gt, q_gt, sigma, wA=w_gt, wB=w_gt)
- #     cross = cross_simpson3(p_pred, q_pred, p_gt, q_gt, sigma, wB=w_gt)
- #     # return s_pred + s_gt - 2.0 * cross
- #     return s_pred - 2.0 * cross
-
 
  def loss_simpson3_batch(
      p_pred: torch.Tensor,
@@ -385,77 +166,3 @@ def loss_simpson3_mix_batch(
          losses = [loss_simpson3_batch(p_pred, q_pred, p_gt, q_gt, sigmas_t[:, i], w_gt=w_gt, w_pred=w_pred, cross_only=cross_only) for i in range(sigmas_t.shape[1])]
          return (torch.stack(losses, dim=0) * alpha_t[:, None]).sum(dim=0)
      raise ValueError("sigmas must be 1D or 2D for batch loss")
-
-
- def loss_simpson3_lenpow_mix(
-     p_pred,
-     q_pred,
-     p_gt,
-     q_gt,
-     sigmas,
-     alpha,
-     len_pow: float,
-     normalize_alpha: bool = True,
- ):
-     device, dtype = p_pred.device, p_pred.dtype
-     sigmas_t, alpha_t = _prepare_mix_weights(sigmas, alpha, device, dtype, normalize_alpha)
-     losses = [loss_simpson3_lenpow(p_pred, q_pred, p_gt, q_gt, s, len_pow) for s in sigmas_t]
-     return (torch.stack(losses) * alpha_t).sum()
-
-
- def loss_semi_lobatto3(p_pred, q_pred, p_gt, q_gt, sigma: float):
-     s_pred = cross_semi_lobatto3(p_pred, q_pred, p_pred, q_pred, sigma)
-     # s_gt = cross_semi_lobatto3(p_gt, q_gt, p_gt, q_gt, sigma)
-     cross = cross_semi_lobatto3(p_pred, q_pred, p_gt, q_gt, sigma)
-     # return s_pred + s_gt - 2.0 * cross
-     return s_pred - 2.0 * cross
-
-
- def loss_semi_lobatto3_mix(
-     p_pred,
-     q_pred,
-     p_gt,
-     q_gt,
-     sigmas,
-     alpha,
-     normalize_alpha: bool = True,
- ):
-     s_pred = cross_semi_lobatto3_mix(p_pred, q_pred, p_pred, q_pred, sigmas, alpha, normalize_alpha)
-     # s_gt = cross_semi_lobatto3_mix(p_gt, q_gt, p_gt, q_gt, sigmas, alpha, normalize_alpha)
-     cross = cross_semi_lobatto3_mix(p_pred, q_pred, p_gt, q_gt, sigmas, alpha, normalize_alpha)
-     # return s_pred + s_gt - 2.0 * cross
-     return s_pred - 2.0 * cross
-
-
- def loss_semi_lobatto3_mix_simple(
-     p_pred,
-     q_pred,
-     p_gt,
-     q_gt,
-     sigmas,
-     alpha,
-     normalize_alpha: bool = True,
- ):
-     device, dtype = p_pred.device, p_pred.dtype
-     sigmas_t, alpha_t = _prepare_mix_weights(sigmas, alpha, device, dtype, normalize_alpha)
-     losses = [loss_semi_lobatto3(p_pred, q_pred, p_gt, q_gt, s) for s in sigmas_t]
-     return (torch.stack(losses) * alpha_t).sum()
-
-
- def loss_semi_legendre(p_pred, q_pred, p_gt, q_gt, sigma: float, t_nodes01, t_w):
-     s_pred = cross_semi_analytic(p_pred, q_pred, p_pred, q_pred, sigma, t_nodes01, t_w)
-     s_gt = cross_semi_analytic(p_gt, q_gt, p_gt, q_gt, sigma, t_nodes01, t_w)
-     cross = cross_semi_analytic(p_pred, q_pred, p_gt, q_gt, sigma, t_nodes01, t_w)
-     return s_pred + s_gt - 2.0 * cross
-
-
- # -----------------------------
- # torch.compile usage
- # -----------------------------
- # For Legendre: generate nodes/weights ONCE outside compile and pass them in.
- # Example:
- #   import numpy as np
- #   x, w = np.polynomial.legendre.leggauss(Q)
- #   t_nodes = torch.tensor(0.5*(x+1.0), device=device, dtype=dtype)
- #   t_w = torch.tensor(0.5*w, device=device, dtype=dtype)
- #
- # compiled_loss = torch.compile(loss_semi_lobatto3, fullgraph=True)
- # compiled_loss_leg = torch.compile(lambda pp,qp,pg,qg,s: loss_semi_legendre(pp,qp,pg,qg,s,t_nodes,t_w),
- #                                   fullgraph=True)
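For intuition about what these kernels compute, the removed `varifold_loss_centers` fallback summarized each segment by its midpoint, unit direction, and length, then compared the two sets with a Gaussian-times-squared-cosine kernel: loss = ⟨pred,pred⟩ + ⟨gt,gt⟩ − 2⟨pred,gt⟩, which is zero when the two kernel embeddings coincide. A pure-Python sketch of that idea (loop-based rather than vectorized, so it only illustrates the math, not the tensor implementation):

```python
import math

def center_varifold_loss(pred, gt, sigma=0.1):
    """Center-point varifold distance between two sets of 3D segments.

    Each segment (p, q) becomes (midpoint x, unit direction u, length w).
    Kernel: exp(-|x_i - x_j|^2 / (2 sigma^2)) * (u_i . u_j)^2, weighted by lengths.
    """
    def feats(segs):
        out = []
        for p, q in segs:
            v = [b - a for a, b in zip(p, q)]
            ell = math.sqrt(sum(c * c for c in v)) or 1e-8  # guard zero-length
            x = [0.5 * (a + b) for a, b in zip(p, q)]
            u = [c / ell for c in v]
            out.append((x, u, ell))
        return out

    def inner(A, B):
        inv2s2 = 1.0 / (2.0 * sigma * sigma)
        total = 0.0
        for xa, ua, wa in A:
            for xb, ub, wb in B:
                d2 = sum((a - b) ** 2 for a, b in zip(xa, xb))
                dot = sum(a * b for a, b in zip(ua, ub))
                total += wa * wb * math.exp(-d2 * inv2s2) * dot * dot
        return total

    P, G = feats(pred), feats(gt)
    return inner(P, P) + inner(G, G) - 2.0 * inner(P, G)
```

The `(u_i · u_j)²` factor makes the kernel orientation-invariant (a segment and its flipped copy embed identically), which is why no endpoint ordering is needed.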
script.py CHANGED
@@ -34,14 +34,14 @@ from s23dr_2026_example.make_sampled_cache import _priority_sample
  # Tokenizer / model imports
  from s23dr_2026_example.tokenizer import EdgeDepthSequenceConfig
  from s23dr_2026_example.model import EdgeDepthSegmentsModel
- from s23dr_2026_example.segment_postprocess import merge_vertices, merge_vertices_iterative
+ from s23dr_2026_example.segment_postprocess import merge_vertices_iterative
  from s23dr_2026_example.varifold import segments_to_vertices_edges
  from s23dr_2026_example.postprocess_v2 import snap_to_point_cloud, snap_horizontal
 
- SEQ_LEN = 2048
- COLMAP_QUOTA = 1536
- DEPTH_QUOTA = 512
- CONF_THRESH = 0.7
+ SEQ_LEN = 4096
+ COLMAP_QUOTA = 3072
+ DEPTH_QUOTA = 1024
+ CONF_THRESH = 0.5
  MERGE_THRESH = 0.4
  SNAP_RADIUS = 0.5
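Note that the new constants keep `COLMAP_QUOTA + DEPTH_QUOTA == SEQ_LEN` (3072 + 1024 = 4096). A hypothetical sketch of how such per-source quotas might be filled with spill-over, purely to illustrate the budget split; the repo's actual `_priority_sample` may allocate differently:

```python
def fill_quotas(n_colmap, n_depth, colmap_quota=3072, depth_quota=1024):
    """Split a fixed token budget between two point sources.

    Take up to each source's quota; if one source cannot fill its quota,
    hand the leftover budget to the other source so the sequence stays full
    whenever enough points exist.
    """
    take_colmap = min(n_colmap, colmap_quota)
    take_depth = min(n_depth, depth_quota)
    # Spill unused budget to the COLMAP side first, then to depth.
    spare = (colmap_quota - take_colmap) + (depth_quota - take_depth)
    take_colmap += min(n_colmap - take_colmap, spare)
    spare = (colmap_quota + depth_quota) - take_colmap - take_depth
    take_depth += min(n_depth - take_depth, spare)
    return take_colmap, take_depth
```

With abundant points on both sides this returns exactly (3072, 1024); when one source is short, the other absorbs the slack up to the 4096-token sequence length.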