diff --git a/ferritin-esm/Readme.md b/ferritin-esm/Readme.md index 4301c1b1..f9adf544 100644 --- a/ferritin-esm/Readme.md +++ b/ferritin-esm/Readme.md @@ -28,6 +28,31 @@ for (name, tensor) in pth.tensor_infos() { ``` +```text +embed.weight: TensorInfo { name: "embed.weight", dtype: F32, layout: Layout { shape: [64, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/301", storage_size: 61440 } + +transformer.norm.weight: TensorInfo { name: "transformer.norm.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/300", storage_size: 960 } + +sequence_head.0.bias: TensorInfo { name: "sequence_head.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/303", storage_size: 960 } +sequence_head.0.weight: TensorInfo { name: "sequence_head.0.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/302", storage_size: 921600 } +sequence_head.2.bias: TensorInfo { name: "sequence_head.2.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/305", storage_size: 960 } +sequence_head.2.weight: TensorInfo { name: "sequence_head.2.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/304", storage_size: 960 } +sequence_head.3.bias: TensorInfo { name: "sequence_head.3.bias", dtype: F32, layout: Layout { shape: [64], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/307", storage_size: 64 } +sequence_head.3.weight: TensorInfo { name: "sequence_head.3.weight", dtype: F32, layout: Layout { shape: [64, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/306", storage_size: 61440 } + +transformer.blocks.0.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.0.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/4", storage_size: 960 } +transformer.blocks.0.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.0.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/1", storage_size: 960 } +transformer.blocks.0.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.0.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/0", storage_size: 960 } +transformer.blocks.0.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.0.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/2", storage_size: 2764800 } +transformer.blocks.0.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.0.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/5", storage_size: 921600 } +transformer.blocks.0.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.0.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/3", storage_size: 960 } +transformer.blocks.0.ffn.0.bias: TensorInfo { name: "transformer.blocks.0.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/7", storage_size: 960 } 
+transformer.blocks.0.ffn.0.weight: TensorInfo { name: "transformer.blocks.0.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/6", storage_size: 960 } +transformer.blocks.0.ffn.1.weight: TensorInfo { name: "transformer.blocks.0.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/8", storage_size: 4915200 } +transformer.blocks.0.ffn.3.weight: TensorInfo { name: "transformer.blocks.0.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/9", storage_size: 2457600 } +``` + + ```text embed.weight: TensorInfo { name: "embed.weight", dtype: F32, layout: Layout { shape: [64, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/301", storage_size: 61440 } sequence_head.0.bias: TensorInfo { name: "sequence_head.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/303", storage_size: 960 } @@ -56,6 +81,106 @@ transformer.blocks.1.ffn.0.bias: TensorInfo { name: "transformer.blocks.1.ffn.0. transformer.blocks.1.ffn.0.weight: TensorInfo { name: "transformer.blocks.1.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/16", storage_size: 960 } transformer.blocks.1.ffn.1.weight: TensorInfo { name: "transformer.blocks.1.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/18", storage_size: 4915200 } transformer.blocks.1.ffn.3.weight: TensorInfo { name: "transformer.blocks.1.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/19", storage_size: 2457600 } +transformer.blocks.10.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.10.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/104", storage_size: 960 } +transformer.blocks.10.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.10.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/101", storage_size: 960 } +transformer.blocks.10.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.10.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/100", storage_size: 960 } +transformer.blocks.10.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.10.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/102", storage_size: 2764800 } +transformer.blocks.10.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.10.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/105", storage_size: 921600 } +transformer.blocks.10.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.10.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/103", storage_size: 960 } +transformer.blocks.10.ffn.0.bias: TensorInfo { name: "transformer.blocks.10.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, 
path: "esmc_300m_v0_fp32/data/107", storage_size: 960 } +transformer.blocks.10.ffn.0.weight: TensorInfo { name: "transformer.blocks.10.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/106", storage_size: 960 } +transformer.blocks.10.ffn.1.weight: TensorInfo { name: "transformer.blocks.10.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/108", storage_size: 4915200 } +transformer.blocks.10.ffn.3.weight: TensorInfo { name: "transformer.blocks.10.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/109", storage_size: 2457600 } +transformer.blocks.11.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.11.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/114", storage_size: 960 } +transformer.blocks.11.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.11.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/111", storage_size: 960 } +transformer.blocks.11.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.11.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/110", storage_size: 960 } +transformer.blocks.11.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.11.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/112", storage_size: 2764800 } +transformer.blocks.11.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.11.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/115", storage_size: 921600 } +transformer.blocks.11.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.11.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/113", storage_size: 960 } +transformer.blocks.11.ffn.0.bias: TensorInfo { name: "transformer.blocks.11.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/117", storage_size: 960 } +transformer.blocks.11.ffn.0.weight: TensorInfo { name: "transformer.blocks.11.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/116", storage_size: 960 } +transformer.blocks.11.ffn.1.weight: TensorInfo { name: "transformer.blocks.11.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/118", storage_size: 4915200 } +transformer.blocks.11.ffn.3.weight: TensorInfo { name: "transformer.blocks.11.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/119", storage_size: 2457600 } +transformer.blocks.12.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.12.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/124", storage_size: 960 } +transformer.blocks.12.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.12.attn.layernorm_qkv.0.bias", dtype: F32, 
layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/121", storage_size: 960 } +transformer.blocks.12.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.12.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/120", storage_size: 960 } +transformer.blocks.12.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.12.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/122", storage_size: 2764800 } +transformer.blocks.12.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.12.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/125", storage_size: 921600 } +transformer.blocks.12.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.12.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/123", storage_size: 960 } +transformer.blocks.12.ffn.0.bias: TensorInfo { name: "transformer.blocks.12.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/127", storage_size: 960 } +transformer.blocks.12.ffn.0.weight: TensorInfo { name: "transformer.blocks.12.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/126", storage_size: 960 } +transformer.blocks.12.ffn.1.weight: TensorInfo { name: "transformer.blocks.12.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/128", storage_size: 4915200 } +transformer.blocks.12.ffn.3.weight: TensorInfo { name: "transformer.blocks.12.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/129", storage_size: 2457600 } +transformer.blocks.13.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.13.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/134", storage_size: 960 } +transformer.blocks.13.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.13.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/131", storage_size: 960 } +transformer.blocks.13.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.13.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/130", storage_size: 960 } +transformer.blocks.13.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.13.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/132", storage_size: 2764800 } +transformer.blocks.13.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.13.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/135", storage_size: 921600 } +transformer.blocks.13.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.13.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/133", storage_size: 960 } 
+transformer.blocks.13.ffn.0.bias: TensorInfo { name: "transformer.blocks.13.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/137", storage_size: 960 } +transformer.blocks.13.ffn.0.weight: TensorInfo { name: "transformer.blocks.13.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/136", storage_size: 960 } +transformer.blocks.13.ffn.1.weight: TensorInfo { name: "transformer.blocks.13.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/138", storage_size: 4915200 } +transformer.blocks.13.ffn.3.weight: TensorInfo { name: "transformer.blocks.13.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/139", storage_size: 2457600 } +transformer.blocks.14.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.14.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/144", storage_size: 960 } +transformer.blocks.14.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.14.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/141", storage_size: 960 } +transformer.blocks.14.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.14.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/140", storage_size: 960 } +transformer.blocks.14.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.14.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/142", storage_size: 2764800 } +transformer.blocks.14.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.14.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/145", storage_size: 921600 } +transformer.blocks.14.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.14.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/143", storage_size: 960 } +transformer.blocks.14.ffn.0.bias: TensorInfo { name: "transformer.blocks.14.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/147", storage_size: 960 } +transformer.blocks.14.ffn.0.weight: TensorInfo { name: "transformer.blocks.14.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/146", storage_size: 960 } +transformer.blocks.14.ffn.1.weight: TensorInfo { name: "transformer.blocks.14.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/148", storage_size: 4915200 } +transformer.blocks.14.ffn.3.weight: TensorInfo { name: "transformer.blocks.14.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/149", storage_size: 2457600 } +transformer.blocks.15.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.15.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: 
"esmc_300m_v0_fp32/data/154", storage_size: 960 } +transformer.blocks.15.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.15.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/151", storage_size: 960 } +transformer.blocks.15.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.15.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/150", storage_size: 960 } +transformer.blocks.15.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.15.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/152", storage_size: 2764800 } +transformer.blocks.15.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.15.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/155", storage_size: 921600 } +transformer.blocks.15.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.15.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/153", storage_size: 960 } +transformer.blocks.15.ffn.0.bias: TensorInfo { name: "transformer.blocks.15.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/157", storage_size: 960 } +transformer.blocks.15.ffn.0.weight: TensorInfo { name: "transformer.blocks.15.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/156", storage_size: 960 } +transformer.blocks.15.ffn.1.weight: TensorInfo { name: "transformer.blocks.15.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/158", storage_size: 4915200 } +transformer.blocks.15.ffn.3.weight: TensorInfo { name: "transformer.blocks.15.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/159", storage_size: 2457600 } +transformer.blocks.16.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.16.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/164", storage_size: 960 } +transformer.blocks.16.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.16.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/161", storage_size: 960 } +transformer.blocks.16.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.16.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/160", storage_size: 960 } +transformer.blocks.16.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.16.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/162", storage_size: 2764800 } +transformer.blocks.16.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.16.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/165", storage_size: 921600 } +transformer.blocks.16.attn.q_ln.weight: TensorInfo 
{ name: "transformer.blocks.16.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/163", storage_size: 960 } +transformer.blocks.16.ffn.0.bias: TensorInfo { name: "transformer.blocks.16.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/167", storage_size: 960 } +transformer.blocks.16.ffn.0.weight: TensorInfo { name: "transformer.blocks.16.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/166", storage_size: 960 } +transformer.blocks.16.ffn.1.weight: TensorInfo { name: "transformer.blocks.16.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/168", storage_size: 4915200 } +transformer.blocks.16.ffn.3.weight: TensorInfo { name: "transformer.blocks.16.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/169", storage_size: 2457600 } +transformer.blocks.17.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.17.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/174", storage_size: 960 } +transformer.blocks.17.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.17.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/171", storage_size: 960 } +transformer.blocks.17.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.17.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/170", storage_size: 960 } +transformer.blocks.17.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.17.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/172", storage_size: 2764800 } +transformer.blocks.17.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.17.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/175", storage_size: 921600 } +transformer.blocks.17.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.17.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/173", storage_size: 960 } +transformer.blocks.17.ffn.0.bias: TensorInfo { name: "transformer.blocks.17.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/177", storage_size: 960 } +transformer.blocks.17.ffn.0.weight: TensorInfo { name: "transformer.blocks.17.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/176", storage_size: 960 } +transformer.blocks.17.ffn.1.weight: TensorInfo { name: "transformer.blocks.17.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/178", storage_size: 4915200 } +transformer.blocks.17.ffn.3.weight: TensorInfo { name: "transformer.blocks.17.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/179", storage_size: 2457600 } 
+transformer.blocks.18.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.18.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/184", storage_size: 960 } +transformer.blocks.18.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.18.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/181", storage_size: 960 } +transformer.blocks.18.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.18.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/180", storage_size: 960 } +transformer.blocks.18.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.18.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/182", storage_size: 2764800 } +transformer.blocks.18.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.18.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/185", storage_size: 921600 } +transformer.blocks.18.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.18.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/183", storage_size: 960 } +transformer.blocks.18.ffn.0.bias: TensorInfo { name: "transformer.blocks.18.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/187", storage_size: 960 } +transformer.blocks.18.ffn.0.weight: TensorInfo { name: "transformer.blocks.18.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/186", storage_size: 960 } +transformer.blocks.18.ffn.1.weight: TensorInfo { name: "transformer.blocks.18.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/188", storage_size: 4915200 } +transformer.blocks.18.ffn.3.weight: TensorInfo { name: "transformer.blocks.18.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/189", storage_size: 2457600 } +transformer.blocks.19.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.19.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/194", storage_size: 960 } +transformer.blocks.19.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.19.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/191", storage_size: 960 } +transformer.blocks.19.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.19.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/190", storage_size: 960 } +transformer.blocks.19.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.19.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/192", storage_size: 2764800 } +transformer.blocks.19.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.19.attn.out_proj.weight", dtype: F32, 
layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/195", storage_size: 921600 } +transformer.blocks.19.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.19.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/193", storage_size: 960 } +transformer.blocks.19.ffn.0.bias: TensorInfo { name: "transformer.blocks.19.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/197", storage_size: 960 } +transformer.blocks.19.ffn.0.weight: TensorInfo { name: "transformer.blocks.19.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/196", storage_size: 960 } +transformer.blocks.19.ffn.1.weight: TensorInfo { name: "transformer.blocks.19.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/198", storage_size: 4915200 } +transformer.blocks.19.ffn.3.weight: TensorInfo { name: "transformer.blocks.19.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/199", storage_size: 2457600 } transformer.blocks.2.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.2.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/24", storage_size: 960 } transformer.blocks.2.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.2.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/21", storage_size: 960 } transformer.blocks.2.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.2.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/20", storage_size: 960 } @@ -66,6 +191,106 @@ transformer.blocks.2.ffn.0.bias: TensorInfo { name: "transformer.blocks.2.ffn.0. 
transformer.blocks.2.ffn.0.weight: TensorInfo { name: "transformer.blocks.2.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/26", storage_size: 960 } transformer.blocks.2.ffn.1.weight: TensorInfo { name: "transformer.blocks.2.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/28", storage_size: 4915200 } transformer.blocks.2.ffn.3.weight: TensorInfo { name: "transformer.blocks.2.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/29", storage_size: 2457600 } +transformer.blocks.20.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.20.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/204", storage_size: 960 } +transformer.blocks.20.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.20.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/201", storage_size: 960 } +transformer.blocks.20.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.20.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/200", storage_size: 960 } +transformer.blocks.20.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.20.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/202", storage_size: 2764800 } +transformer.blocks.20.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.20.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/205", storage_size: 921600 } +transformer.blocks.20.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.20.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/203", storage_size: 960 } +transformer.blocks.20.ffn.0.bias: TensorInfo { name: "transformer.blocks.20.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/207", storage_size: 960 } +transformer.blocks.20.ffn.0.weight: TensorInfo { name: "transformer.blocks.20.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/206", storage_size: 960 } +transformer.blocks.20.ffn.1.weight: TensorInfo { name: "transformer.blocks.20.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/208", storage_size: 4915200 } +transformer.blocks.20.ffn.3.weight: TensorInfo { name: "transformer.blocks.20.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/209", storage_size: 2457600 } +transformer.blocks.21.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.21.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/214", storage_size: 960 } +transformer.blocks.21.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.21.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: 
"esmc_300m_v0_fp32/data/211", storage_size: 960 } +transformer.blocks.21.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.21.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/210", storage_size: 960 } +transformer.blocks.21.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.21.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/212", storage_size: 2764800 } +transformer.blocks.21.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.21.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/215", storage_size: 921600 } +transformer.blocks.21.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.21.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/213", storage_size: 960 } +transformer.blocks.21.ffn.0.bias: TensorInfo { name: "transformer.blocks.21.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/217", storage_size: 960 } +transformer.blocks.21.ffn.0.weight: TensorInfo { name: "transformer.blocks.21.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/216", storage_size: 960 } +transformer.blocks.21.ffn.1.weight: TensorInfo { name: "transformer.blocks.21.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/218", storage_size: 4915200 } +transformer.blocks.21.ffn.3.weight: TensorInfo { name: "transformer.blocks.21.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/219", storage_size: 2457600 } +transformer.blocks.22.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.22.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/224", storage_size: 960 } +transformer.blocks.22.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.22.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/221", storage_size: 960 } +transformer.blocks.22.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.22.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/220", storage_size: 960 } +transformer.blocks.22.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.22.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/222", storage_size: 2764800 } +transformer.blocks.22.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.22.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/225", storage_size: 921600 } +transformer.blocks.22.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.22.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/223", storage_size: 960 } +transformer.blocks.22.ffn.0.bias: TensorInfo { name: 
"transformer.blocks.22.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/227", storage_size: 960 } +transformer.blocks.22.ffn.0.weight: TensorInfo { name: "transformer.blocks.22.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/226", storage_size: 960 } +transformer.blocks.22.ffn.1.weight: TensorInfo { name: "transformer.blocks.22.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/228", storage_size: 4915200 } +transformer.blocks.22.ffn.3.weight: TensorInfo { name: "transformer.blocks.22.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/229", storage_size: 2457600 } +transformer.blocks.23.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.23.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/234", storage_size: 960 } +transformer.blocks.23.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.23.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/231", storage_size: 960 } +transformer.blocks.23.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.23.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/230", storage_size: 960 } +transformer.blocks.23.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.23.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/232", storage_size: 2764800 } +transformer.blocks.23.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.23.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/235", storage_size: 921600 } +transformer.blocks.23.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.23.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/233", storage_size: 960 } +transformer.blocks.23.ffn.0.bias: TensorInfo { name: "transformer.blocks.23.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/237", storage_size: 960 } +transformer.blocks.23.ffn.0.weight: TensorInfo { name: "transformer.blocks.23.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/236", storage_size: 960 } +transformer.blocks.23.ffn.1.weight: TensorInfo { name: "transformer.blocks.23.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/238", storage_size: 4915200 } +transformer.blocks.23.ffn.3.weight: TensorInfo { name: "transformer.blocks.23.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/239", storage_size: 2457600 } +transformer.blocks.24.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.24.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/244", storage_size: 960 } 
+transformer.blocks.24.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.24.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/241", storage_size: 960 } +transformer.blocks.24.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.24.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/240", storage_size: 960 } +transformer.blocks.24.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.24.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/242", storage_size: 2764800 } +transformer.blocks.24.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.24.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/245", storage_size: 921600 } +transformer.blocks.24.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.24.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/243", storage_size: 960 } +transformer.blocks.24.ffn.0.bias: TensorInfo { name: "transformer.blocks.24.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/247", storage_size: 960 } +transformer.blocks.24.ffn.0.weight: TensorInfo { name: "transformer.blocks.24.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/246", storage_size: 960 } +transformer.blocks.24.ffn.1.weight: TensorInfo { name: "transformer.blocks.24.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/248", storage_size: 4915200 } +transformer.blocks.24.ffn.3.weight: TensorInfo { name: "transformer.blocks.24.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/249", storage_size: 2457600 } +transformer.blocks.25.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.25.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/254", storage_size: 960 } +transformer.blocks.25.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.25.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/251", storage_size: 960 } +transformer.blocks.25.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.25.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/250", storage_size: 960 } +transformer.blocks.25.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.25.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/252", storage_size: 2764800 } +transformer.blocks.25.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.25.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/255", storage_size: 921600 } +transformer.blocks.25.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.25.attn.q_ln.weight", 
dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/253", storage_size: 960 } +transformer.blocks.25.ffn.0.bias: TensorInfo { name: "transformer.blocks.25.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/257", storage_size: 960 } +transformer.blocks.25.ffn.0.weight: TensorInfo { name: "transformer.blocks.25.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/256", storage_size: 960 } +transformer.blocks.25.ffn.1.weight: TensorInfo { name: "transformer.blocks.25.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/258", storage_size: 4915200 } +transformer.blocks.25.ffn.3.weight: TensorInfo { name: "transformer.blocks.25.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/259", storage_size: 2457600 } +transformer.blocks.26.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.26.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/264", storage_size: 960 } +transformer.blocks.26.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.26.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/261", storage_size: 960 } +transformer.blocks.26.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.26.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/260", storage_size: 960 } +transformer.blocks.26.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.26.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/262", storage_size: 2764800 } +transformer.blocks.26.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.26.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/265", storage_size: 921600 } +transformer.blocks.26.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.26.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/263", storage_size: 960 } +transformer.blocks.26.ffn.0.bias: TensorInfo { name: "transformer.blocks.26.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/267", storage_size: 960 } +transformer.blocks.26.ffn.0.weight: TensorInfo { name: "transformer.blocks.26.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/266", storage_size: 960 } +transformer.blocks.26.ffn.1.weight: TensorInfo { name: "transformer.blocks.26.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/268", storage_size: 4915200 } +transformer.blocks.26.ffn.3.weight: TensorInfo { name: "transformer.blocks.26.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/269", storage_size: 2457600 } +transformer.blocks.27.attn.k_ln.weight: TensorInfo { name: 
"transformer.blocks.27.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/274", storage_size: 960 } +transformer.blocks.27.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.27.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/271", storage_size: 960 } +transformer.blocks.27.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.27.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/270", storage_size: 960 } +transformer.blocks.27.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.27.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/272", storage_size: 2764800 } +transformer.blocks.27.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.27.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/275", storage_size: 921600 } +transformer.blocks.27.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.27.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/273", storage_size: 960 } +transformer.blocks.27.ffn.0.bias: TensorInfo { name: "transformer.blocks.27.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/277", storage_size: 960 } +transformer.blocks.27.ffn.0.weight: TensorInfo { name: "transformer.blocks.27.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/276", storage_size: 960 } +transformer.blocks.27.ffn.1.weight: TensorInfo { name: "transformer.blocks.27.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/278", storage_size: 4915200 } +transformer.blocks.27.ffn.3.weight: TensorInfo { name: "transformer.blocks.27.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/279", storage_size: 2457600 } +transformer.blocks.28.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.28.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/284", storage_size: 960 } +transformer.blocks.28.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.28.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/281", storage_size: 960 } +transformer.blocks.28.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.28.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/280", storage_size: 960 } +transformer.blocks.28.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.28.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/282", storage_size: 2764800 } +transformer.blocks.28.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.28.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], 
start_offset: 0 }, path: "esmc_300m_v0_fp32/data/285", storage_size: 921600 } +transformer.blocks.28.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.28.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/283", storage_size: 960 } +transformer.blocks.28.ffn.0.bias: TensorInfo { name: "transformer.blocks.28.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/287", storage_size: 960 } +transformer.blocks.28.ffn.0.weight: TensorInfo { name: "transformer.blocks.28.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/286", storage_size: 960 } +transformer.blocks.28.ffn.1.weight: TensorInfo { name: "transformer.blocks.28.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/288", storage_size: 4915200 } +transformer.blocks.28.ffn.3.weight: TensorInfo { name: "transformer.blocks.28.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/289", storage_size: 2457600 } +transformer.blocks.29.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.29.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/294", storage_size: 960 } +transformer.blocks.29.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.29.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/291", storage_size: 960 } +transformer.blocks.29.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.29.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/290", storage_size: 960 } +transformer.blocks.29.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.29.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/292", storage_size: 2764800 } +transformer.blocks.29.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.29.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/295", storage_size: 921600 } +transformer.blocks.29.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.29.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/293", storage_size: 960 } +transformer.blocks.29.ffn.0.bias: TensorInfo { name: "transformer.blocks.29.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/297", storage_size: 960 } +transformer.blocks.29.ffn.0.weight: TensorInfo { name: "transformer.blocks.29.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/296", storage_size: 960 } +transformer.blocks.29.ffn.1.weight: TensorInfo { name: "transformer.blocks.29.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/298", storage_size: 4915200 } +transformer.blocks.29.ffn.3.weight: TensorInfo { name: "transformer.blocks.29.ffn.3.weight", dtype: F32, layout: Layout { shape: 
[960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/299", storage_size: 2457600 } transformer.blocks.3.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.3.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/34", storage_size: 960 } transformer.blocks.3.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.3.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/31", storage_size: 960 } transformer.blocks.3.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.3.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/30", storage_size: 960 } @@ -114,4 +339,28 @@ transformer.blocks.7.attn.out_proj.weight: TensorInfo { name: "transformer.block transformer.blocks.7.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.7.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/73", storage_size: 960 } transformer.blocks.7.ffn.0.bias: TensorInfo { name: "transformer.blocks.7.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/77", storage_size: 960 } transformer.blocks.7.ffn.0.weight: TensorInfo { name: "transformer.blocks.7.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/76", storage_size: 960 } -transformer.blocks.7.ffn.1.weight: TensorInfo { name: "transformer.blocks.7. +transformer.blocks.7.ffn.1.weight: TensorInfo { name: "transformer.blocks.7.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/78", storage_size: 4915200 } +transformer.blocks.7.ffn.3.weight: TensorInfo { name: "transformer.blocks.7.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/79", storage_size: 2457600 } +transformer.blocks.8.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.8.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/84", storage_size: 960 } +transformer.blocks.8.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.8.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/81", storage_size: 960 } +transformer.blocks.8.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.8.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/80", storage_size: 960 } +transformer.blocks.8.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.8.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/82", storage_size: 2764800 } +transformer.blocks.8.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.8.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/85", storage_size: 921600 } +transformer.blocks.8.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.8.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], 
+transformer.blocks.8.ffn.0.bias: TensorInfo { name: "transformer.blocks.8.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/87", storage_size: 960 }
+transformer.blocks.8.ffn.0.weight: TensorInfo { name: "transformer.blocks.8.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/86", storage_size: 960 }
+transformer.blocks.8.ffn.1.weight: TensorInfo { name: "transformer.blocks.8.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/88", storage_size: 4915200 }
+transformer.blocks.8.ffn.3.weight: TensorInfo { name: "transformer.blocks.8.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/89", storage_size: 2457600 }
+transformer.blocks.9.attn.k_ln.weight: TensorInfo { name: "transformer.blocks.9.attn.k_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/94", storage_size: 960 }
+transformer.blocks.9.attn.layernorm_qkv.0.bias: TensorInfo { name: "transformer.blocks.9.attn.layernorm_qkv.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/91", storage_size: 960 }
+transformer.blocks.9.attn.layernorm_qkv.0.weight: TensorInfo { name: "transformer.blocks.9.attn.layernorm_qkv.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/90", storage_size: 960 }
+transformer.blocks.9.attn.layernorm_qkv.1.weight: TensorInfo { name: "transformer.blocks.9.attn.layernorm_qkv.1.weight", dtype: F32, layout: Layout { shape: [2880, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/92", storage_size: 2764800 }
+transformer.blocks.9.attn.out_proj.weight: TensorInfo { name: "transformer.blocks.9.attn.out_proj.weight", dtype: F32, layout: Layout { shape: [960, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/95", storage_size: 921600 }
+transformer.blocks.9.attn.q_ln.weight: TensorInfo { name: "transformer.blocks.9.attn.q_ln.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/93", storage_size: 960 }
+transformer.blocks.9.ffn.0.bias: TensorInfo { name: "transformer.blocks.9.ffn.0.bias", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/97", storage_size: 960 }
+transformer.blocks.9.ffn.0.weight: TensorInfo { name: "transformer.blocks.9.ffn.0.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/96", storage_size: 960 }
+transformer.blocks.9.ffn.1.weight: TensorInfo { name: "transformer.blocks.9.ffn.1.weight", dtype: F32, layout: Layout { shape: [5120, 960], stride: [960, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/98", storage_size: 4915200 }
+transformer.blocks.9.ffn.3.weight: TensorInfo { name: "transformer.blocks.9.ffn.3.weight", dtype: F32, layout: Layout { shape: [960, 2560], stride: [2560, 1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/99", storage_size: 2457600 }
+
+transformer.norm.weight: TensorInfo { name: "transformer.norm.weight", dtype: F32, layout: Layout { shape: [960], stride: [1], start_offset: 0 }, path: "esmc_300m_v0_fp32/data/300", storage_size: 960 }
+```
diff --git a/ferritin-esm/examples/esmc/main.rs b/ferritin-esm/examples/esmc/main.rs
index 04afd948..9b4f128f 100644
--- a/ferritin-esm/examples/esmc/main.rs
+++ b/ferritin-esm/examples/esmc/main.rs
@@ -4,7 +4,6 @@ use candle_core::{DType, Device, D};
use candle_hf_hub::{api::sync::Api, Repo, RepoType};
use candle_nn::VarBuilder;
use ferritin_esm::{ESMCConfig, ESMC};
-use safetensors::SafeTensors;

// pub fn esmc_300m_202412(device: &Device) -> Result> {
//     let tokenizer = get_model_tokenizers(ESM3_OPEN_SMALL)?.sequence;
@@ -38,8 +37,15 @@ fn main() -> Result<()> {
    let vb = VarBuilder::from_backend(Box::new(pth), DType::F32, Device::Cpu);

    let config = ESMCConfig::esmc_300m();
-    let esmc = ESMC::load(vb, config);
+    let esmc = ESMC::load(vb.clone(), config)?;
+    // println!("ESMC Loaded: {}", esmc);
+
+    // Error: cannot find tensor transformer.layer.attention.layer_norm.weight
+
+    println!(
+        "VB: {}",
+        vb.contains_tensor("transformer.blocks.6.attn.layernorm_qkv.1.weight")
+    );

-    println!("ESMC Loaded");
    Ok(())
}
diff --git a/ferritin-esm/src/esm/layers/attention.rs b/ferritin-esm/src/esm/layers/attention.rs
index c2651f7b..206edf1f 100644
--- a/ferritin-esm/src/esm/layers/attention.rs
+++ b/ferritin-esm/src/esm/layers/attention.rs
@@ -55,16 +55,21 @@ impl MultiHeadAttention {
        } = config;

        let d_head = d_model / n_heads;
-        let ln_conf = LayerNormConfig::from(1e-5);
-        let layernorm = nn::layer_norm(*d_model, ln_conf, vb.pp("layer_norm"))?;
-        let linear = nn::linear(*d_model, d_model * 3, vb.pp("linear1"))?;
+        // let ln_conf = LayerNormConfig::from(1e-5);
+        let ln_conf = LayerNormConfig {
+            eps: 1e-5,
+            remove_mean: true,
+            affine: false,
+        };
+        let layernorm = nn::layer_norm(*d_model, ln_conf, vb.pp("layernorm_qkv.0"))?;
+        let linear = nn::linear_no_bias(*d_model, d_model * 3, vb.pp("layernorm_qkv.1"))?;
        let layernorm_qkv = nn::seq().add(layernorm).add(linear);
-        let out_proj = nn::linear(*d_model, *d_model, vb.pp("out_proj"))?;
-
+        let out_proj = nn::linear_no_bias(*d_model, *d_model, vb.pp("out_proj"))?;
        // note: only handling the True case for the moment
        // let qk_layernorm = true
        let q_ln = Box::new(nn::layer_norm(*d_model, ln_conf, vb.pp("q_ln"))?);
        let k_ln = Box::new(nn::layer_norm(*d_model, ln_conf, vb.pp("k_ln"))?);
+
        let rotary = RotaryEmbedding::load(vb.pp("rotary"), config)?;

        Ok(Self {
diff --git a/ferritin-esm/src/esm/layers/blocks.rs b/ferritin-esm/src/esm/layers/blocks.rs
index c5659de6..33c66802 100644
--- a/ferritin-esm/src/esm/layers/blocks.rs
+++ b/ferritin-esm/src/esm/layers/blocks.rs
@@ -1,7 +1,7 @@
use super::attention::MultiHeadAttention;
use super::geom_attention::GeometricReasoningOriginalImpl;
use crate::esm::models::esmc::{ESMCConfig, Ffn_Type};
-use crate::esm::utils::structure::affine3d::Affine3D;
+// use crate::esm::utils::structure::affine3d::Affine3D;
use candle_core::{Module, Result, Tensor, D};
use candle_nn::ops::silu;
use candle_nn::{self as nn, VarBuilder};
@@ -29,9 +29,9 @@ impl SwiGLU {
        let hidden_dim = Self::swiglu_correction_fn(*expansion_ratio, *d_model);

        Ok(Self {
-            layer_norm: nn::layer_norm(*d_model, 1e-5, vb.pp("layer_norm"))?,
-            linear1: nn::linear(*d_model, hidden_dim * 2, vb.pp("linear1"))?,
-            linear2: nn::linear(hidden_dim, *d_model, vb.pp("linear2"))?,
+            layer_norm: nn::layer_norm(*d_model, 1e-5, vb.pp("0"))?,
+            linear1: nn::linear_no_bias(*d_model, hidden_dim * 2, vb.pp("1"))?,
+            linear2: nn::linear_no_bias(hidden_dim, *d_model, vb.pp("3"))?,
        })
    }
}
@@ -131,29 +131,33 @@ impl UnifiedTransformerBlock {
    pub fn load(vb: VarBuilder, config: &ESMCConfig, layer: usize) -> Result<Self> {
        let ESMCConfig {
            ffn_type,
+            v_head_transformer,
            use_plain_attn,
            n_layers_geom,
            residue_scaling_factor,
            ..
        } = config;

-        let use_geom_attn: bool = layer < *n_layers_geom;
-
        let attn = match use_plain_attn {
            false => None,
-            true => Some(MultiHeadAttention::load(vb.pp("attention"), config)?),
+            true => Some(MultiHeadAttention::load(vb.pp("attn"), config)?),
        };

-        let geom_attn = match use_geom_attn {
-            false => None,
-            true => Some(GeometricReasoningOriginalImpl::load(
-                vb.pp("geometric"),
-                config,
-            )?),
-        };
+        // println!("LAYER; GEOM: {}, {}", layer, n_layers_geom);
+        let use_geom_attn: bool = layer < *n_layers_geom;
+        // println!("Geom ATTN {}", use_geom_attn);
+        // let geom_attn = match use_geom_attn {
+        //     false => None,
+        //     true => Some(GeometricReasoningOriginalImpl::load(
+        //         vb.pp("geometric"),
+        //         config,
+        //     )?),
+        // };
+
+        let geom_attn = None;

        let ffn = match ffn_type {
-            Ffn_Type::SWIGLU => SwiGLU::load(vb.pp("swiglue"), config)?,
+            Ffn_Type::SWIGLU => SwiGLU::load(vb.pp("ffn"), config)?,
            _ => unimplemented!(),
            // Ffn_Type::GLU => unimplemented!(),
        };
diff --git a/ferritin-esm/src/esm/layers/geom_attention.rs b/ferritin-esm/src/esm/layers/geom_attention.rs
index 2e953c94..6dc30c23 100644
--- a/ferritin-esm/src/esm/layers/geom_attention.rs
+++ b/ferritin-esm/src/esm/layers/geom_attention.rs
@@ -48,7 +48,9 @@ impl GeometricReasoningOriginalImpl {
        } = config;

        let num_vector_messages = 1usize;
-        let v_heads = v_head_transformer.unwrap();
+
+        // todo: this is a hidden param. Needs to be fixed
+        let v_heads = v_head_transformer.unwrap_or(128);
        let dim_proj = 4 * v_heads * 3 + v_heads * 3 * num_vector_messages;
        let channels_out = v_heads * 3 * num_vector_messages;

diff --git a/ferritin-esm/src/esm/layers/regression_head.rs b/ferritin-esm/src/esm/layers/regression_head.rs
index 114146f5..41e1b8b9 100644
--- a/ferritin-esm/src/esm/layers/regression_head.rs
+++ b/ferritin-esm/src/esm/layers/regression_head.rs
@@ -27,18 +27,14 @@ impl RegressionHead {
            ..
        } = config;

-        let linear1 = nn::linear(
-            *d_model,
-            *regression_head_hidden_dim,
-            vb.pp("regression_linear"),
-        )?;
+        let linear1 = nn::linear(*d_model, *regression_head_hidden_dim, vb.pp("0"))?;
        let gelu = candle_nn::Activation::Gelu;
        let ln_conf = LayerNormConfig::from(1e-5);
-        let norm = nn::layer_norm(*regression_head_hidden_dim, ln_conf, vb.pp("layer_norm"))?;
+        let norm = nn::layer_norm(*regression_head_hidden_dim, ln_conf, vb.pp("2"))?;
        let linear2 = nn::linear(
            *regression_head_hidden_dim,
            *regression_head_output_dim,
-            vb.pp("linear2"),
+            vb.pp("3"),
        )?;

        let model = nn::seq().add(linear1).add(gelu).add(norm).add(linear2);
diff --git a/ferritin-esm/src/esm/layers/transformer_stack.rs b/ferritin-esm/src/esm/layers/transformer_stack.rs
index 04a83912..d4375b16 100644
--- a/ferritin-esm/src/esm/layers/transformer_stack.rs
+++ b/ferritin-esm/src/esm/layers/transformer_stack.rs
@@ -1,6 +1,6 @@
use crate::esm::layers::blocks::UnifiedTransformerBlock;
use crate::esm::models::esmc::ESMCConfig;
-use crate::esm::utils::structure::affine3d::Affine3D;
+// use crate::esm::utils::structure::affine3d::Affine3D;
use candle_core::{Module, Result, Tensor, D};
use candle_nn::{self as nn, LayerNorm, LayerNormConfig};

@@ -31,11 +31,20 @@ impl TransformerStack {
        let mut blocks = Vec::with_capacity(*n_layers as usize);

        for i in 0..*n_layers {
-            blocks.push(UnifiedTransformerBlock::load(vb.pp("layer"), &config, i)?);
+            blocks.push(UnifiedTransformerBlock::load(
+                vb.pp(format!("blocks.{}", i)),
+                &config,
+                i,
+            )?);
        }

-        let ln_conf = LayerNormConfig::from(1e-5);
-        let norm = nn::layer_norm(*d_model, ln_conf, vb.pp("layer_norm"))?;
+        // let ln_conf = LayerNormConfig::from(1e-5);
+        let ln_conf = LayerNormConfig {
+            eps: 1e-5,
+            remove_mean: true,
+            affine: false,
+        };
+        let norm = nn::layer_norm(*d_model, ln_conf, vb.pp("norm"))?;

        Ok(Self { blocks, norm })
    }
diff --git a/ferritin-esm/src/esm/models/esmc.rs b/ferritin-esm/src/esm/models/esmc.rs
index 1b3eba0c..ff29e486 100644
--- a/ferritin-esm/src/esm/models/esmc.rs
+++ b/ferritin-esm/src/esm/models/esmc.rs
@@ -122,30 +122,17 @@ impl ESMC {
    pub fn load(vb: VarBuilder, config: ESMCConfig) -> Result<Self> {
        let ESMCConfig {
            d_model,
-            n_heads,
-            n_layers,
-            v_head_transformer,
-            ffn_type,
            tokenizer,
-            use_plain_attn,
-            n_layers_geom,
-            scale_residue,
-            residue_scaling_factor,
-            mask_and_zero_frameless,
-            bias,
-            qk_layernorm,
-            expansion_ratio,
-            regression_head_output_dim
-            regression_head_hidden_dim,
            embedding_dim,
+            ..
        } = config;

        let tokenizer_collection = tokenizer.get_model_tokenizers();

        Ok(Self {
-            embed: nn::embedding(embedding_dim, d_model as usize, vb.pp("embedding"))?,
+            embed: nn::embedding(embedding_dim, d_model as usize, vb.pp("embed"))?,
            transformer: TransformerStack::load(vb.pp("transformer"), &config)?,
-            sequence_head: RegressionHead::load(vb.pp("regression"), &config)?,
+            sequence_head: RegressionHead::load(vb.pp("sequence_head"), &config)?,
            tokenizer: tokenizer_collection.sequence,
        })
    }
diff --git a/ferritin-esm/src/esm/utils/structure/affine3D.rs b/ferritin-esm/src/esm/utils/structure/affine3D.rs
index c3429fe4..d79783e6 100644
--- a/ferritin-esm/src/esm/utils/structure/affine3D.rs
+++ b/ferritin-esm/src/esm/utils/structure/affine3D.rs
@@ -24,7 +24,8 @@ pub trait Rotation: Sized + Clone {
    }

    fn requires_grad(&self) -> bool {
-        self.tensor().requires_grad()
+        // self.tensor().requires_grad()
+        unimplemented!()
    }

    fn to_dtype(&self, dtype: DType) -> Result<Self>;
diff --git a/ferritin-esm/src/esm/utils/structure/mod.rs b/ferritin-esm/src/esm/utils/structure/mod.rs
index efd3860b..8cd74c91 100644
--- a/ferritin-esm/src/esm/utils/structure/mod.rs
+++ b/ferritin-esm/src/esm/utils/structure/mod.rs
@@ -1,4 +1,4 @@
pub mod affine3d;
-mod protein_chain;
-mod protein_complex;
-mod protein_structure;
+// mod protein_chain;
+// mod protein_complex;
+// mod protein_structure;
diff --git a/justfile b/justfile
index 9c90ca76..c022372d 100644
--- a/justfile
+++ b/justfile
@@ -45,4 +45,5 @@ test-ligandmpnn:
    cargo test --features metal -p ferritin-ligandmpnn test_cli_command_run_example_06 -- --nocapture

esmc:
+    #RUST_BACKTRACE=1 cargo run --example esmc
    cargo run --example esmc