diff --git a/tests/unit/maxtext_utils_test.py b/tests/unit/maxtext_utils_test.py index a23e8bad17..2c9710dd77 100644 --- a/tests/unit/maxtext_utils_test.py +++ b/tests/unit/maxtext_utils_test.py @@ -827,5 +827,37 @@ def test_wsd_schedule(self): self.assertIn("wsd_decay_steps_fraction", str(cm.exception)) +class TestGetAbstractState(unittest.TestCase): + """Test class for get_abstract_state.""" + + def setUp(self): + self.config = pyconfig.initialize( + [None, get_test_config_path()], + enable_checkpointing=False, + model_name="llama3.1-8b", + per_device_batch_size=1, + max_target_length=16, + ) + devices_array = maxtext_utils.create_device_mesh(self.config) + self.mesh = Mesh(devices_array, self.config.mesh_axes) + quant = quantizations.configure_quantization(self.config) + self.model = Transformer(self.config, mesh=self.mesh, quant=quant, model_mode=MODEL_MODE_TRAIN) + self.rng = jax.random.PRNGKey(0) + self.tx = optax.adam(learning_rate=0.001) + + def test_get_abstract_state(self): + """Tests that get_abstract_state returns abstract arrays.""" + # get_abstract_state returns a tuple, the first element is the abstract state. 
+ abstract_state, _, _ = maxtext_utils.get_abstract_state(self.model, self.tx, self.config, self.rng, self.mesh, None) + + # Check that params are abstract + param_leaves = jax.tree_util.tree_leaves(abstract_state.params) + self.assertTrue(all(isinstance(leaf, jax.ShapeDtypeStruct) for leaf in param_leaves)) + + # Check that opt_state is abstract + opt_state_leaves = jax.tree_util.tree_leaves(abstract_state.opt_state) + self.assertTrue(all(isinstance(leaf, jax.ShapeDtypeStruct) for leaf in opt_state_leaves)) + + if __name__ == "__main__": unittest.main() diff --git a/tests/unit/sharding_compare_test.py b/tests/unit/sharding_compare_test.py index d4a5ef50a7..8d2d7bc7fb 100644 --- a/tests/unit/sharding_compare_test.py +++ b/tests/unit/sharding_compare_test.py @@ -1,4 +1,4 @@ -# Copyright 2023–2025 Google LLC +# Copyright 2023–2026 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,12 +18,21 @@ import json import os import pytest +import jax +import jax.numpy as jnp +# import optax +from MaxText.globals import MAXTEXT_PKG_DIR from MaxText.train_compile import get_shaped_inputs, get_topology_mesh, validate_config from MaxText import pyconfig +from MaxText import maxtext_utils +from MaxText.layers import models +from MaxText.layers import quantizations +from MaxText import optimizers -from tests.utils.sharding_dump import named_shardings_to_json, load_named_sharding_json, TEST_CASES -from tests.utils.test_helpers import get_test_config_path +from tests.utils.sharding_dump import load_json, TEST_CASES, named_shardings_to_json, partition_specs_to_json + +Transformer = models.transformer_as_linen def compute_checksum(d: dict) -> str: @@ -37,7 +46,7 @@ def compute_checksum(d: dict) -> str: return checksum -def compare_named_sharding_jsons(json1: dict, model1_name: str, json2: dict, model2_name: str) -> bool: +def compare_sharding_jsons(json1: dict, model1_name: str, json2: 
dict, model2_name: str) -> bool: """Compare two json files and print the differences if any.""" keys1 = set(json1.keys()) keys2 = set(json2.keys()) @@ -46,66 +55,210 @@ def compare_named_sharding_jsons(json1: dict, model1_name: str, json2: dict, mod only_in_2 = keys2 - keys1 shared_keys = keys1 & keys2 + has_diff = False + if only_in_1: print(f"Keys only in {model1_name}:") for k in sorted(only_in_1): print(f" {k}") + has_diff = True if only_in_2: print(f"Keys only in {model2_name}:") for k in sorted(only_in_2): print(f" {k}") + has_diff = True for key in sorted(shared_keys): entry1 = json1[key] entry2 = json2[key] - mesh1 = entry1.get("mesh", {}) - mesh2 = entry2.get("mesh", {}) - spec1 = entry1.get("partition_spec", []) - spec2 = entry2.get("partition_spec", []) + if isinstance(entry1, dict) and isinstance(entry2, dict): + mesh1 = entry1.get("mesh", {}) + mesh2 = entry2.get("mesh", {}) + + spec1 = entry1.get("partition_spec", []) + spec2 = entry2.get("partition_spec", []) + + shape1 = entry1.get("shape") + shape2 = entry2.get("shape") + + if mesh1 != mesh2: + print(f"\nMesh mismatch at '{key}':") + print(f" {model1_name}: {mesh1}") + print(f" {model2_name}: {mesh2}") + has_diff = True + + if spec1 != spec2: + print(f"\nPartitionSpec mismatch at '{key}':") + print(f" {model1_name}: {spec1}") + print(f" {model2_name}: {spec2}") + has_diff = True - if mesh1 != mesh2: - print(f"\nMesh mismatch at '{key}':") - print(f" mesh1: {mesh1}") - print(f" mesh2: {mesh2}") + if shape1 != shape2: + print(f"\nShape mismatch at '{key}':") + print(f" {model1_name}: {shape1}") + print(f" {model2_name}: {shape2}") + has_diff = True - if spec1 != spec2: - print(f"\nPartitionSpec mismatch at '{key}':") - print(f" spec1: {spec1}") - print(f" spec2: {spec2}") + else: + print(f"\nFormat mismatch at '{key}':") + print(f" {model1_name} type: {type(entry1)}") + print(f" {model2_name} type: {type(entry2)}") + has_diff = True - return not only_in_1 and not only_in_2 and all(json1[k] == 
json2[k] for k in shared_keys) + return has_diff @pytest.mark.parametrize("model_name, topology, num_slice", TEST_CASES) def test_sharding_dump_for_model(model_name: str, topology: str, num_slice: str) -> None: - """Test if the sharding of new model implementation is as expected.""" + """ + Test sharding configurations from train_compile.get_shaped_inputs. + This test verifies that the sharding configurations for various models and topologies remain consistent with golden files. + """ params = [ "/deps/MaxText/tests/unit/sharding_compare_test", - get_test_config_path(), + os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"), f"compile_topology={topology}", f"compile_topology_num_slices={num_slice}", f"model_name={model_name}", ] - json_path = f"sharding_info/" f"{model_name}/" f"{topology}/" f"slice_{num_slice}/named_shardings.json" - if not os.path.exists(json_path): + root_dir = "tests/utils/sharding_info" + base_path = os.path.join(root_dir, model_name, topology, f"slice_{num_slice}") + + named_json_path = os.path.join(base_path, "named_shardings.json") + logical_json_path = os.path.join(base_path, "logical_shardings.json") + + if not os.path.exists(named_json_path): + pytest.skip(f"Missing named_shardings.json for {model_name} {topology} slice {num_slice}") + return + if not os.path.exists(logical_json_path): + pytest.skip(f"Missing logical_shardings.json for {model_name} {topology} slice {num_slice}") return config = pyconfig.initialize(params) validate_config(config) topology_mesh = get_topology_mesh(config) - _, _, state_mesh_shardings, _, _ = get_shaped_inputs(topology_mesh, config) - actual_json = named_shardings_to_json(state_mesh_shardings) - expected_json = load_named_sharding_json(json_path) + shaped_train_args, _, state_mesh_shardings, logical_shardings, _ = get_shaped_inputs(topology_mesh, config) + + error_messages = [] + + # 1. 
Compare Named Shardings + actual_named = named_shardings_to_json(state_mesh_shardings, shaped_train_args[0]) + expected_named = load_json(named_json_path) + # calculate checksum + actual_named_sum = compute_checksum(actual_named) + expected_named_sum = compute_checksum(expected_named) + named_match = actual_named_sum == expected_named_sum + + if not named_match: + print(f"\n[FAIL] Physical Sharding Mismatch: {model_name} {topology} slice {num_slice}", flush=True) + compare_sharding_jsons(expected_named, "Expected (Physical)", actual_named, "Actual (Physical)") + error_messages.append(f" Physical sharding mismatch for {model_name} on {topology} slice {num_slice}") + + # 2. Compare Logical Shardings + actual_logical = partition_specs_to_json(logical_shardings, shaped_train_args[0]) + expected_logical = load_json(logical_json_path) + # calculate checksum + actual_logical_sum = compute_checksum(actual_logical) + expected_logical_sum = compute_checksum(expected_logical) + logical_match = actual_logical_sum == expected_logical_sum + + if not logical_match: + print(f"\n[FAIL] Logical Sharding Mismatch: {model_name} {topology} slice {num_slice}", flush=True) + compare_sharding_jsons(expected_logical, "Expected (Logical)", actual_logical, "Actual (Logical)") + error_messages.append(f"Logical sharding mismatch for {model_name} on {topology} slice {num_slice}") + + assert not error_messages, "\n".join(error_messages) + + +@pytest.fixture( + scope="module", + params=[pytest.param(case, id=f"{case[0]}-{case[1]}-{case[2]}") for case in TEST_CASES], +) +def abstract_state_and_shardings(request): + """Pytest fixture to set up model, config, and generate abstract state once per test case.""" + model_name, topology, num_slice = request.param + print(f"Testing model: {model_name}, topology: {topology}, num_slices: {num_slice}", flush=True) + params = [ + "/deps/MaxText/tests/unit/sharding_compare_test", + os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"), + 
f"compile_topology={topology}", + f"compile_topology_num_slices={num_slice}", + f"model_name={model_name}", + "weight_dtype=float32", + ] + config = pyconfig.initialize(params) + validate_config(config) + + topology_mesh = get_topology_mesh(config) + quant = quantizations.configure_quantization(config) + model = Transformer(config, mesh=topology_mesh, quant=quant) + + learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(config) + # tx = optax.adam(learning_rate=learning_rate_schedule) + tx = optimizers.get_optimizer(config, learning_rate_schedule) + rng = jax.random.PRNGKey(0) + + # Get abstract state and physical shardings from maxtext_utils + abstract_state, _, state_mesh_shardings = maxtext_utils.get_abstract_state( + model, tx, config, rng, topology_mesh, is_training=True + ) + + # Get logical shardings from maxtext_utils + logical_shardings = maxtext_utils.get_logical_annotations(model, tx, config, rng, topology_mesh, is_training=True) + + return model_name, topology, num_slice, abstract_state, state_mesh_shardings, logical_shardings + + +class TestGetAbstractState: + """Test class for get_abstract_state function and sharding comparison.""" + + def test_get_abstract_state_sharding(self, abstract_state_and_shardings): # pylint: disable=redefined-outer-name + """Tests that get_abstract_state returns a state with the correct abstract structure and compares sharding.""" + + model_name, topology, num_slice, abstract_state, state_mesh_shardings, logical_shardings = ( + abstract_state_and_shardings + ) + + assert hasattr(abstract_state, "params") + assert hasattr(abstract_state, "opt_state") + param_leaf = jax.tree_util.tree_leaves(abstract_state.params)[0] + assert isinstance(param_leaf, jax.ShapeDtypeStruct) + assert param_leaf.dtype == jnp.float32 + + root_dir = "tests/utils/sharding_info" # Or your target directory + base_path = os.path.join(root_dir, model_name, topology, f"slice_{num_slice}") + os.makedirs(base_path, exist_ok=True) # Ensure 
directory exists for saving actual + + error_messages = [] + + # 1. Compare Physical/Named Shardings + named_json_path = os.path.join(base_path, "named_shardings.json") + if not os.path.exists(named_json_path): + pytest.skip(f"Missing named_shardings.json for {model_name} {topology} slice {num_slice}") + return + + # Use state_mesh_shardings from the fixture + actual_named = named_shardings_to_json(state_mesh_shardings, abstract_state) + expected_named = load_json(named_json_path) + + if compare_sharding_jsons(expected_named, "Expected (Physical)", actual_named, "Actual (Physical)"): + error_messages.append(f"Physical sharding mismatch for {model_name} on {topology} slice {num_slice}") + + # 2. Compare Logical Shardings + logical_json_path = os.path.join(base_path, "logical_shardings.json") + if not os.path.exists(logical_json_path): + pytest.skip(f"Missing logical_shardings.json for {model_name} {topology} slice {num_slice}") + return - actual_checksum = compute_checksum(actual_json) - expected_checksum2 = compute_checksum(expected_json) - result = actual_checksum == expected_checksum2 + # Use logical_shardings from the fixture + actual_logical = partition_specs_to_json(logical_shardings, abstract_state) + expected_logical = load_json(logical_json_path) - if not result: - compare_named_sharding_jsons(expected_json, f"expected_{model_name}", actual_json, f"actual_{model_name}") + if compare_sharding_jsons(expected_logical, "Expected (Logical)", actual_logical, "Actual (Logical)"): + error_messages.append(f"Logical sharding mismatch for {model_name} on {topology} slice {num_slice}") - assert result is True + assert not error_messages, "\n".join(error_messages) diff --git a/tests/utils/run_sharding_dump.py b/tests/utils/run_sharding_dump.py index 323aec7198..70b8e7bc1b 100644 --- a/tests/utils/run_sharding_dump.py +++ b/tests/utils/run_sharding_dump.py @@ -12,18 +12,52 @@ # See the License for the specific language governing permissions and # limitations under the 
License. -""" Run script to dump sharding of various combination of model and topology. """ +"""Run script to dump sharding of various combinations of model and topology. + +This script is a utility to generate and save the sharding configurations +(both physical and logical) for various model and hardware topology combinations. +These saved configurations act as "golden" files for regression testing. + +There are two primary ways to use the script: + +1. Generate Sharding for All Predefined Test Cases +---------------------------------------------------- +Run the script without any command-line arguments to iterate through all test +cases defined in `tests.utils.sharding_dump.TEST_CASES`. It will skip any +combination for which the output files already exist. + +Command: + python3 -m tests.utils.run_sharding_dump + +2. Generate Sharding for a Single, Specific Case +------------------------------------------------- +Provide the `model_name`, `topology`, and `num_slice` as command-line arguments +to generate sharding information for a single configuration. You must provide +all three arguments. 
+ +Command: + python3 -m tests.utils.run_sharding_dump --model_name MODEL_NAME --topology TOPOLOGY --num_slice NUM_SLICE + +Example: + python3 -m tests.utils.run_sharding_dump --model_name gemma-7b --topology v5p-256 --num_slice 1 + +""" from typing import Sequence -from MaxText.globals import MAXTEXT_REPO_ROOT +from MaxText.globals import MAXTEXT_PKG_DIR, MAXTEXT_REPO_ROOT from tests.utils.sharding_dump import TEST_CASES -from tests.utils.test_helpers import get_test_config_path import os import subprocess +from absl import app, flags +from pathlib import Path + +FLAGS = flags.FLAGS -from absl import app +flags.DEFINE_string("model_name", None, "Specific model name to dump.") +flags.DEFINE_string("topology", None, "Specific topology to dump.") +flags.DEFINE_string("num_slice", None, "Specific number of slices to dump.") def run_single_dump(model_name: str, topology: str, num_slice: str) -> None: @@ -33,31 +67,47 @@ def run_single_dump(model_name: str, topology: str, num_slice: str) -> None: "python3", "-m", "tests.utils.sharding_dump", - get_test_config_path(), + os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"), f"compile_topology={topology}", f"compile_topology_num_slices={num_slice}", f"model_name={model_name}", + "weight_dtype=float32", ], check=True, ) def main(argv: Sequence[str]) -> None: - """Generate sharding json files for every combination of model, topology and slices.""" - for model_name, topology, num_slice in TEST_CASES: - json_path = os.path.join( - MAXTEXT_REPO_ROOT, - "tests", - "utils", - "sharding_info", - model_name, - topology, - f"slice_{num_slice}", - "named_shardings.json", + """Generate json files for every combination of model, topology and slices.""" + if FLAGS.model_name and FLAGS.topology and FLAGS.num_slice: + cases_to_run = [(FLAGS.model_name, FLAGS.topology, FLAGS.num_slice)] + print( + "Running specific case from command line: " + f"Model={FLAGS.model_name}, Topology={FLAGS.topology}, NumSlice={FLAGS.num_slice}" ) - if os.path.exists(json_path): + elif 
FLAGS.model_name or FLAGS.topology or FLAGS.num_slice: + print("Error: To specify a single test case, --model_name, --topology, and --num_slice must all be provided.") + return + else: + cases_to_run = TEST_CASES + print(f"Running all {len(TEST_CASES)} predefined test cases.") + + total = len(cases_to_run) + for i, (model_name, topology, num_slice) in enumerate(cases_to_run): + print(f"\n[{i+1}/{total}] Processing: {model_name} | {topology} | Slice {num_slice}") + + base_path = Path(f"{MAXTEXT_REPO_ROOT}/tests/utils/sharding_info/{model_name}/" f"{topology}/slice_{num_slice}/") + json_path_named = base_path / "named_shardings.json" + json_path_logical = base_path / "logical_shardings.json" + + if json_path_named.exists() and json_path_logical.exists(): + print(" -> Sharding files already exist. Skipping.") continue - run_single_dump(model_name, topology, str(num_slice)) + + try: + run_single_dump(model_name, topology, str(num_slice)) + except subprocess.CalledProcessError: + print(f"!!! FAILED: {model_name} {topology} {num_slice}") if __name__ == "__main__": diff --git a/tests/utils/sharding_dump.py b/tests/utils/sharding_dump.py index c096c98136..ec7b98b752 100644 --- a/tests/utils/sharding_dump.py +++ b/tests/utils/sharding_dump.py @@ -1,4 +1,4 @@ -# Copyright 2023–2025 Google LLC +# Copyright 2023–2026 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -21,12 +21,14 @@ import json import itertools from pathlib import Path -from typing import List, Sequence, Union +from typing import List, Sequence, Union, Any import jax from absl import app from jax.tree_util import tree_flatten_with_path from jax.sharding import NamedSharding, PartitionSpec from MaxText import pyconfig +from MaxText import maxtext_utils +from MaxText import optimizers from MaxText.globals import MAXTEXT_REPO_ROOT from MaxText.train_compile import get_shaped_inputs, get_topology_mesh, validate_config from MaxText.layers import models @@ -42,16 +44,19 @@ # "llama3-8b", # "llama3-70b", # "llama3.1-8b", - "llama3.1-70b", - "llama3.1-405b", + # "llama3.1-70b", + # "llama3.1-405b", # "llama3.3-70b", # "mistral-7b", # "mixtral-8x7b", # "mixtral-8x22b", - # "deepseek2-16b", + "deepseek2-16b", # "deepseek2-236b", # "deepseek3-671b", + # "deepseek3-671b-2dfsdp", # "deepseek3-test", + # "deepseek3-tiny", + # "deepseek3.2-671b", # "gemma-7b", # "gemma-2b", # "gemma2-2b", @@ -60,18 +65,127 @@ # "gemma3-4b", # "gemma3-12b", # "gemma3-27b", - # "qwen3-0.6b", + "qwen3-0.6b", # "qwen3-4b", + # "qwen3-4b-thinking-2507", # "qwen3-8b", + # "qwen3-14b", + # "qwen3-32b", + # "qwen3-235b-a22b", + # "qwen3-30b-a3b", + # "qwen3-480b-a35b", + # "qwen3-next-80b-a3b", + # "qwen3-omni-30b-a3b", # "gpt3-175b", # "gpt3-22b", # "gpt3-6b", # "gpt3-52k", + "gpt-oss-20b", + # "gpt-oss-120b", # "llama4-17b-16e", # "llama4-17b-128e", ] TOPOLOGIES = [ + # "tpu7x-2", + # "tpu7x-8", + "tpu7x-16", + # "tpu7x-32", + # "tpu7x-64", + # "tpu7x-128", + # "tpu7x-256", + # "tpu7x-384", + # "tpu7x-512", + # "tpu7x-640", + # "tpu7x-768", + # "tpu7x-896", + # "tpu7x-1024", + # "tpu7x-1152", + # "tpu7x-1280", + # "tpu7x-1408", + # "tpu7x-1536", + # "tpu7x-1664", + # "tpu7x-1792", + # "tpu7x-1920", + # "tpu7x-2048", + # "tpu7x-2176", + # "tpu7x-2304", + # "tpu7x-2432", + # "tpu7x-2560", + # "tpu7x-2688", + # "tpu7x-2816", + # "tpu7x-2944", + # "tpu7x-3072", + # "tpu7x-3200", + # "tpu7x-3328", + 
# "tpu7x-3456", + # "tpu7x-3584", + # "tpu7x-3712", + # "tpu7x-3840", + # "tpu7x-3968", + # "tpu7x-4096", + # "tpu7x-4224", + # "tpu7x-4352", + # "tpu7x-4480", + # "tpu7x-4608", + # "tpu7x-4736", + # "tpu7x-4864", + # "tpu7x-4992", + # "tpu7x-5120", + # "tpu7x-5248", + # "tpu7x-5376", + # "tpu7x-5504", + # "tpu7x-5632", + # "tpu7x-5760", + # "tpu7x-5888", + # "tpu7x-6016", + # "tpu7x-6144", + # "tpu7x-6272", + # "tpu7x-6400", + # "tpu7x-6528", + # "tpu7x-6656", + # "tpu7x-6784", + # "tpu7x-6912", + # "tpu7x-7040", + # "tpu7x-7168", + # "tpu7x-7296", + # "tpu7x-7424", + # "tpu7x-7552", + # "tpu7x-7680", + # "tpu7x-7808", + # "tpu7x-7936", + # "tpu7x-8064", + # "tpu7x-8192", + # "tpu7x-8320", + # "tpu7x-8448", + # "tpu7x-8704", + # "tpu7x-8832", + # "tpu7x-8960", + # "tpu7x-9216", + # "tpu7x-9472", + # "tpu7x-9600", + # "tpu7x-9728", + # "tpu7x-9856", + # "tpu7x-9984", + # "tpu7x-10240", + # "tpu7x-10368", + # "tpu7x-10496", + # "tpu7x-10752", + # "tpu7x-10880", + # "tpu7x-11008", + # "tpu7x-11136", + # "tpu7x-11264", + # "tpu7x-11520", + # "tpu7x-11648", + # "tpu7x-11776", + # "tpu7x-11904", + # "tpu7x-12032", + # "tpu7x-12160", + # "tpu7x-12288", + # "tpu7x-13824", + # "tpu7x-16384", + # "tpu7x-17920", + # "tpu7x-18432", # "v6e-1", # "v6e-4", # "v6e-8", @@ -79,15 +193,15 @@ # "v6e-32", # "v6e-64", # "v6e-128", - "v6e-256", + # "v6e-256", # "v5e-1", # "v5e-4", # "v5e-8", - "v5e-16", + # "v5e-16", # "v5e-32", # "v5e-64", # "v5e-128", - "v5e-256", + # "v5e-256", # "v4-8", # "v4-16", # "v4-32", @@ -105,7 +219,7 @@ # "v5p-32", # "v5p-64", # "v5p-128", - "v5p-256", + # "v5p-256", # "v5p-384", # "v5p-512", # "v5p-640", @@ -199,7 +313,7 @@ # "a3" ] -SLICES = [1, 4, 8192] +SLICES = [1, 4] TEST_CASES = list(itertools.product(MODEL_NAMES, TOPOLOGIES, SLICES)) @@ -218,16 +332,20 @@ def convert(entry): return list(convert(e) for e in spec) -def named_shardings_to_json(train_state) -> dict[str, dict]: +def named_shardings_to_json(train_state, shape_tree) -> dict[str, dict]: 
"""Extract NamedSharding instances from a trainstate and save to JSON file.""" named_shardings = {} flat_items = tree_flatten_with_path(train_state)[0] - for path, leaf in flat_items: - if isinstance(leaf, NamedSharding): - name = "/".join(str(p) for p in path) - mesh = leaf.mesh - spec = leaf.spec + flat_shapes, _ = tree_flatten_with_path(shape_tree) + + for (path_s, leaf_s), (_, leaf_sh) in zip(flat_items, flat_shapes): + if isinstance(leaf_s, NamedSharding): + name = "/".join(str(p) for p in path_s) + mesh = leaf_s.mesh + spec = leaf_s.spec + # Extract shape from the shape_tree leaf (likely a ShapeDtypeStruct) + shape = list(leaf_sh.shape) if hasattr(leaf_sh, "shape") else None named_shardings[name] = { "mesh": { @@ -235,21 +353,43 @@ def named_shardings_to_json(train_state) -> dict[str, dict]: "shape": dict(mesh.shape), }, "partition_spec": _json_spec(spec), + "shape": shape, } print(f"Got {len(named_shardings)} NamedSharding entries.") return named_shardings -def save_named_sharding_dict(output_path: str | Path, sharding_dict: dict) -> None: - """Save the sharding dict directly to a JSON file.""" +def partition_specs_to_json(logical_tree, shape_tree) -> dict[str, Any]: + """ + Extract PartitionSpecs (Logical) from the logical tree. + Leaf nodes are expected to be PartitionSpec (or None). 
+ """ + logical_dict = {} + flat_items = tree_flatten_with_path(logical_tree)[0] + flat_shapes, _ = tree_flatten_with_path(shape_tree) + + for (path_l, leaf_l), (_, leaf_sh) in zip(flat_items, flat_shapes): + # leaf should be PartitionSpec or None + if isinstance(leaf_l, PartitionSpec) or leaf_l is None: + name = "/".join(str(p) for p in path_l) + # Extract shape + shape = list(leaf_sh.shape) if hasattr(leaf_sh, "shape") else None + + logical_dict[name] = {"partition_spec": _json_spec(leaf_l), "shape": shape} + print(f"Got {len(logical_dict)} Logical entries.") + return logical_dict + + +def save_json(output_path: str | Path, sharding_dict: dict) -> None: + """Save dict to a JSON file.""" output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_path, "w", encoding="utf-8") as f: json.dump(sharding_dict, f, indent=2) -def load_named_sharding_json(json_path: str | Path) -> dict: +def load_json(json_path: str | Path) -> dict: """Loads the named_shardings.json file into a plain Python dict.""" json_path = Path(json_path) with open(json_path, "r", encoding="utf-8") as f: @@ -267,26 +407,40 @@ def main(argv: Sequence[str]) -> None: config = pyconfig.initialize(argv) validate_config(config) - json_path = ( + base_path = Path( f"{MAXTEXT_REPO_ROOT}/tests/utils/sharding_info/{config.model_name}/" - f"{config.compile_topology}/" - f"slice_{config.compile_topology_num_slices}/" - f"named_shardings.json" + f"{config.compile_topology}/slice_{config.compile_topology_num_slices}/" ) + json_path_named = base_path / "named_shardings.json" + json_path_logical = base_path / "logical_shardings.json" try: topology_mesh = get_topology_mesh(config) - _, _, state_mesh_shardings, _, _ = get_shaped_inputs(topology_mesh, config) - except: # pylint: disable=bare-except - state_mesh_shardings = {} + learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(config) + optimizers.get_optimizer(config, learning_rate_schedule) + 
shaped_train_args, _, state_mesh_shardings, logical_annotations, _ = get_shaped_inputs(topology_mesh, config) + except Exception as e: # pylint: disable=broad-except + print(f"Error generating inputs: {e}") + return - if state_mesh_shardings == {}: + if not state_mesh_shardings: + print("No shardings generated.") return - sharding_dict = named_shardings_to_json(state_mesh_shardings) - save_named_sharding_dict(json_path, sharding_dict) - load_named_sharding_json(json_path) - print(config.model_name, config.compile_topology) + # 1. Generate New Output + # Physical: Tree of NamedSharding + named_shardings = named_shardings_to_json(state_mesh_shardings, shaped_train_args[0]) + # Logical: Tree of PartitionSpec (direct from get_shaped_inputs) + logical_shardings = partition_specs_to_json(logical_annotations, shaped_train_args[0]) + + print(f"Got {len(named_shardings)} Physical entries and {len(logical_shardings)} Logical entries.") + + # 2. Save New Output (Overwrite) + print(f"\nSaving updated shardings to {base_path}...") + save_json(json_path_named, named_shardings) + save_json(json_path_logical, logical_shardings) + + print(f"Finished: {config.model_name} {config.compile_topology}") if __name__ == "__main__": diff --git a/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/logical_shardings.json b/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/logical_shardings.json new file mode 100644 index 0000000000..0d224005c5 --- /dev/null +++ b/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/logical_shardings.json @@ -0,0 +1,980 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + 
".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + 
".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + 
".params/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, 
+ ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + 
}, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/named_shardings.json b/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/named_shardings.json new file mode 100644 index 0000000000..ed09ed2037 --- /dev/null +++ b/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/named_shardings.json @@ -0,0 +1,4178 @@ +{ + ".step": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + 
"context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + 
"autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" 
+ ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + 
"autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + 
"tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + 
"fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + 
"shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + 
"tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + 
"mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + 
"mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + 
"tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" 
+ ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + 
"tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + 
"context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 
1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + 
"autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + 
"shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + 
"axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 
1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + 
], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" 
+ ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + 
"tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", 
+ "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, 
+ "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[2]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, 
+ "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_4/logical_shardings.json b/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_4/logical_shardings.json new file mode 100644 index 0000000000..0d224005c5 --- /dev/null +++ b/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_4/logical_shardings.json @@ -0,0 +1,980 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + 
"embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + 
".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + 
".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + 
"dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 
2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + 
"partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_4/named_shardings.json b/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_4/named_shardings.json new file mode 100644 index 0000000000..a7fa362422 --- /dev/null +++ 
b/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_4/named_shardings.json @@ -0,0 +1,4178 @@ +{ + ".step": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + 
"tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + 
"autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + 
"tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + 
"tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + 
"fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + 
"tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + 
".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + 
"sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + 
"fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + 
"expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { 
+ "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + 
"sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + 
"fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + 
"partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + 
"expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + 
"tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + 
"context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + 
"context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + 
"context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, 
+ ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + 
"tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + 
".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[2]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/deepseek2-16b/v5p-16/slice_1/logical_shardings.json b/tests/utils/sharding_info/deepseek2-16b/v5p-16/slice_1/logical_shardings.json new file mode 100644 index 0000000000..0d224005c5 --- /dev/null +++ b/tests/utils/sharding_info/deepseek2-16b/v5p-16/slice_1/logical_shardings.json @@ -0,0 +1,980 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + 
"dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + 
"shape": [ + 2048, + 102400 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + 
".params/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, 
+ ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + 
}, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/deepseek2-16b/v5p-16/slice_1/named_shardings.json b/tests/utils/sharding_info/deepseek2-16b/v5p-16/slice_1/named_shardings.json new file mode 100644 index 0000000000..a7e781f9c3 --- /dev/null +++ b/tests/utils/sharding_info/deepseek2-16b/v5p-16/slice_1/named_shardings.json @@ -0,0 +1,4178 @@ +{ + ".step": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + 
"context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + 
"autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + 
], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + 
"autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + 
"tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + 
"fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ 
+ 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "mesh": { + 
"axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + 
"axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + 
"tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + 
], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + 
"tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + 
"context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, 
+ "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + 
"autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + 
"shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": 
[ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, 
+ "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, 
+ 1, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + 
"data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + 
"tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", 
+ "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, 
+ "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, 
+ "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[2]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + 
"tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/deepseek2-16b/v5p-16/slice_4/logical_shardings.json b/tests/utils/sharding_info/deepseek2-16b/v5p-16/slice_4/logical_shardings.json new file mode 100644 index 0000000000..0d224005c5 --- /dev/null +++ b/tests/utils/sharding_info/deepseek2-16b/v5p-16/slice_4/logical_shardings.json @@ -0,0 +1,980 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], 
+ "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + 
".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + 
".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + 
"dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 
2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + 
"partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/deepseek2-16b/v5p-16/slice_4/named_shardings.json b/tests/utils/sharding_info/deepseek2-16b/v5p-16/slice_4/named_shardings.json new file mode 100644 index 0000000000..19cd50adc3 --- /dev/null +++ 
b/tests/utils/sharding_info/deepseek2-16b/v5p-16/slice_4/named_shardings.json @@ -0,0 +1,4178 @@ +{ + ".step": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + 
"autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": 
{ + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + 
"tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + 
"fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + 
"context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + 
"sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 
1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + 
".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + 
"sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + 
"fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + 
"expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + 
"mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + 
"sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + 
"fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + 
"partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + 
"expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + 
"tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + 
"context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" 
+ ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + 
"tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + 
"tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + 
".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[2]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/deepseek2-16b/v6e-16/slice_1/logical_shardings.json b/tests/utils/sharding_info/deepseek2-16b/v6e-16/slice_1/logical_shardings.json new file mode 100644 index 0000000000..0d224005c5 --- /dev/null +++ b/tests/utils/sharding_info/deepseek2-16b/v6e-16/slice_1/logical_shardings.json @@ -0,0 +1,980 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + 
"dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + 
"shape": [ + 2048, + 102400 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + 
".params/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, 
+ ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + 
}, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/deepseek2-16b/v6e-16/slice_1/named_shardings.json b/tests/utils/sharding_info/deepseek2-16b/v6e-16/slice_1/named_shardings.json new file mode 100644 index 0000000000..ed09ed2037 --- /dev/null +++ b/tests/utils/sharding_info/deepseek2-16b/v6e-16/slice_1/named_shardings.json @@ -0,0 +1,4178 @@ +{ + ".step": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + 
"context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + 
"autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" 
+ ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + 
"autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + 
"tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + 
"fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + 
"shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + 
"tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + 
"mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + 
"mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + 
"tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" 
+ ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + 
"tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + 
"context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 
1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + 
"autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + 
"shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + 
"axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 
1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + 
], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" 
+ ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + 
"tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", 
+ "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, 
+ "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[2]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, 
+ "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/deepseek2-16b/v6e-16/slice_4/logical_shardings.json b/tests/utils/sharding_info/deepseek2-16b/v6e-16/slice_4/logical_shardings.json new file mode 100644 index 0000000000..0d224005c5 --- /dev/null +++ b/tests/utils/sharding_info/deepseek2-16b/v6e-16/slice_4/logical_shardings.json @@ -0,0 +1,980 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + 
], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + 
".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + 
".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + 
"dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 
2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + 
"partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/deepseek2-16b/v6e-16/slice_4/named_shardings.json b/tests/utils/sharding_info/deepseek2-16b/v6e-16/slice_4/named_shardings.json new file mode 100644 index 0000000000..a7fa362422 --- /dev/null +++ 
b/tests/utils/sharding_info/deepseek2-16b/v6e-16/slice_4/named_shardings.json @@ -0,0 +1,4178 @@ +{ + ".step": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + 
"tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + 
"autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + 
"tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + 
"tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + 
"fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + 
"tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + 
".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + 
"sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + 
"fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + 
"expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { 
+ "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + 
"sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + 
"fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + 
"partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + 
"expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 1 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + 
"tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + 
"context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + 
"context" + ] + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + 
"context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + null, + [ + "fsdp", + "sequence", + "tensor_transpose", + "context", + "expert" + ] + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 2048, + 26 + ] + }, 
+ ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + null + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null, + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + 
"tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + null, + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + 
".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[2]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/gpt-oss-20b/tpu7x-16/slice_1/logical_shardings.json b/tests/utils/sharding_info/gpt-oss-20b/tpu7x-16/slice_1/logical_shardings.json new file mode 100644 index 0000000000..1b90463c89 --- /dev/null +++ b/tests/utils/sharding_info/gpt-oss-20b/tpu7x-16/slice_1/logical_shardings.json @@ -0,0 +1,1490 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ 
+ "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 
12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + 
"embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + 
"partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/gpt-oss-20b/tpu7x-16/slice_1/named_shardings.json b/tests/utils/sharding_info/gpt-oss-20b/tpu7x-16/slice_1/named_shardings.json new file mode 100644 index 0000000000..6a4eb12a10 --- /dev/null +++ b/tests/utils/sharding_info/gpt-oss-20b/tpu7x-16/slice_1/named_shardings.json @@ -0,0 +1,6065 @@ +{ + ".step": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + 
"fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + 
"fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + 
}, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + 
"fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + 
"autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + 
"tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + 
"fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + 
"tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + 
"tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + 
"sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ 
+ "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 
16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + 
"fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", 
+ [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + 
"tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + 
"context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + 
"mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", 
+ "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + 
"context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" 
+ ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + 
"stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + 
"mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + 
"autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + 
"tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + 
"mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } 
+ }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + 
"autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + 
"autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + 
"fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + 
"context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + 
[ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + 
"data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + 
"tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + 
"axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + 
[ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + 
"autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[2]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/gpt-oss-20b/tpu7x-16/slice_4/logical_shardings.json b/tests/utils/sharding_info/gpt-oss-20b/tpu7x-16/slice_4/logical_shardings.json new file mode 100644 index 0000000000..1b90463c89 --- /dev/null +++ b/tests/utils/sharding_info/gpt-oss-20b/tpu7x-16/slice_4/logical_shardings.json @@ -0,0 +1,1490 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + 
".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + 
"shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + 
".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + 
"kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + 
"partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ 
+ 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/gpt-oss-20b/tpu7x-16/slice_4/named_shardings.json b/tests/utils/sharding_info/gpt-oss-20b/tpu7x-16/slice_4/named_shardings.json new file mode 100644 index 0000000000..fffa91ebe5 --- /dev/null +++ b/tests/utils/sharding_info/gpt-oss-20b/tpu7x-16/slice_4/named_shardings.json @@ -0,0 
+1,6065 @@ +{ + ".step": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + 
"context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 
16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, 
+ "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", 
+ "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + 
], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": 
{ + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 
+ ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 
1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + 
"axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + 
[ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + 
"autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } 
+ }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + 
"context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + 
"data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + 
"autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], 
+ "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + 
"axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + 
"tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + 
"shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + 
"sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + 
"tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", 
+ "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + 
"context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], 
+ null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + 
"tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + 
"data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + 
"partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + 
"tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[2]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/gpt-oss-20b/v5p-16/slice_1/logical_shardings.json b/tests/utils/sharding_info/gpt-oss-20b/v5p-16/slice_1/logical_shardings.json new file mode 100644 index 0000000000..1b90463c89 --- /dev/null +++ b/tests/utils/sharding_info/gpt-oss-20b/v5p-16/slice_1/logical_shardings.json 
@@ -0,0 +1,1490 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", 
+ "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + 
"shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + 
"partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 
2880, + 201088 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + 
"partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, 
+ 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + 
"partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ 
+ 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/gpt-oss-20b/v5p-16/slice_1/named_shardings.json b/tests/utils/sharding_info/gpt-oss-20b/v5p-16/slice_1/named_shardings.json new file mode 100644 index 0000000000..a291ec09db --- /dev/null +++ b/tests/utils/sharding_info/gpt-oss-20b/v5p-16/slice_1/named_shardings.json @@ -0,0 +1,6065 
@@ +{ + ".step": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + 
"context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, 
+ "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + 
"context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + 
"context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + 
"shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { 
+ "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] 
+ }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + 
"expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 
1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + 
"stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + 
"tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { 
+ "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + 
}, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + 
"context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + 
"data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 
+ } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 
1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + 
"axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + 
"tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": 
[ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + 
"context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + 
"tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", 
+ "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + 
"shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, 
+ "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + 
"fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + 
"expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[2]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/gpt-oss-20b/v5p-16/slice_4/logical_shardings.json b/tests/utils/sharding_info/gpt-oss-20b/v5p-16/slice_4/logical_shardings.json new file mode 100644 index 0000000000..1b90463c89 --- /dev/null +++ b/tests/utils/sharding_info/gpt-oss-20b/v5p-16/slice_4/logical_shardings.json @@ -0,0 +1,1490 
@@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" 
+ ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, 
+ 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + 
"layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + 
".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + 
"kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + 
"partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ 
+ 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/gpt-oss-20b/v5p-16/slice_4/named_shardings.json b/tests/utils/sharding_info/gpt-oss-20b/v5p-16/slice_4/named_shardings.json new file mode 100644 index 0000000000..1e20b637fe --- /dev/null +++ b/tests/utils/sharding_info/gpt-oss-20b/v5p-16/slice_4/named_shardings.json @@ -0,0 +1,6065 
@@ +{ + ".step": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + 
"context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, 
+ "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + 
"context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + 
"context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + 
"shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { 
+ "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] 
+ }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + 
"expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 
1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + 
"stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + 
"tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { 
+ "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + 
}, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + 
"context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + 
"data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 
+ } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 
4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + 
"axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + 
"tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": 
[ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + 
"context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + 
"tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", 
+ "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + 
"shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, 
+ "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + 
"fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + 
"expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[2]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/gpt-oss-20b/v6e-16/slice_1/logical_shardings.json b/tests/utils/sharding_info/gpt-oss-20b/v6e-16/slice_1/logical_shardings.json new file mode 100644 index 0000000000..1b90463c89 --- /dev/null +++ b/tests/utils/sharding_info/gpt-oss-20b/v6e-16/slice_1/logical_shardings.json @@ -0,0 +1,1490 
@@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" 
+ ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, 
+ 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + 
"layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + 
".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + 
"kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + 
"partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ 
+ 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/gpt-oss-20b/v6e-16/slice_1/named_shardings.json b/tests/utils/sharding_info/gpt-oss-20b/v6e-16/slice_1/named_shardings.json new file mode 100644 index 0000000000..6a4eb12a10 --- /dev/null +++ b/tests/utils/sharding_info/gpt-oss-20b/v6e-16/slice_1/named_shardings.json @@ -0,0 +1,6065 
@@ +{ + ".step": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + 
"context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 
16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, 
+ "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", 
+ "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + 
], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": 
{ + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 
+ ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 
1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + 
"axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + 
[ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + 
"autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } 
+ }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + 
"context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + 
"data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + 
"autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], 
+ "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + 
"axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + 
"tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + 
"shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + 
"sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + 
"tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", 
+ "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + 
"context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], 
+ null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + 
"tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + 
"data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + 
"partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + 
"tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[2]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/gpt-oss-20b/v6e-16/slice_4/logical_shardings.json b/tests/utils/sharding_info/gpt-oss-20b/v6e-16/slice_4/logical_shardings.json new file mode 100644 index 0000000000..1b90463c89 --- /dev/null +++ b/tests/utils/sharding_info/gpt-oss-20b/v6e-16/slice_4/logical_shardings.json 
@@ -0,0 +1,1490 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", 
+ "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + 
"shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + 
"partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 
2880, + 201088 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + 
"partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, 
+ 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + 
"partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "partition_spec": [ + "embed", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ 
+ 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "partition_spec": [ + "q_heads", + "layers", + "kv" + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "partition_spec": [ + "kv_heads", + "layers", + "kv_head_dim" + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "partition_spec": [ + null, + "layers" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "partition_spec": [ + "exp", + "layers", + "embed_no_exp", + "mlp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_mlp" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "partition_spec": [ + "exp", + "layers", + "mlp", + "embed_no_exp" + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "partition_spec": [ + "exp", + "layers", + "activation_embed" + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed", + "vocab" + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/gpt-oss-20b/v6e-16/slice_4/named_shardings.json b/tests/utils/sharding_info/gpt-oss-20b/v6e-16/slice_4/named_shardings.json new file mode 100644 index 0000000000..fffa91ebe5 --- /dev/null +++ b/tests/utils/sharding_info/gpt-oss-20b/v6e-16/slice_4/named_shardings.json @@ -0,0 +1,6065 
@@ +{ + ".step": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + 
"context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 
16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + 
".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, 
+ "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", 
+ "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + 
], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": 
{ + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".params/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 
+ ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 
1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + 
"tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + 
"axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + 
[ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + 
"autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } 
+ }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + 
"context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + 
"fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + 
"data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + 
"context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + 
"autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], 
+ "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + 
"axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + 
"tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + 
"tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + 
"shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + 
"sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_0']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['key']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + 
"tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", 
+ "tensor_sequence", + "autoregressive" + ], + "stage", + null, + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 64, + 12, + 64, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 64, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + null + ], + "shape": [ + 2880, + 12, + 64, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['sinks']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + 
"context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 64, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + "stage", + null + ], + "shape": [ + 8, + 12, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssAttention']/['value']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], 
+ null + ], + "shape": [ + 2880, + 12, + 8, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + null, + "stage" + ], + "shape": [ + 32, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "tensor_transpose", + "context", + "expert" + ], + "stage", + null + ], + "shape": [ + 2880, + 12, + 32 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + 
"tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_0_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ], + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wi_1_bias']": { + "mesh": { + "axis_names": [ + 
"data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose", + "tensor_sequence" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + "expert", + "stage", + [ + "fsdp_transpose", + "tensor", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "sequence", + "tensor_transpose", + "context" + ] + ], + "shape": [ + 32, + 12, + 2880, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['GptOssMlp']/['wo_bias']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + 
"partition_spec": [ + "expert", + "stage", + [ + "tensor", + "tensor_transpose" + ] + ], + "shape": [ + 32, + 12, + 2880 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['layers_1']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 2880, + 12 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + 
"tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ], + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ] + ], + "shape": [ + 2880, + 201088 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose", + "tensor_sequence", + "autoregressive" + ], + [ + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "expert" + ] + ], + "shape": [ + 201088, + 2880 + ] + }, + ".opt_state/[2]/.count": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-405b/v5e-16/slice_8192/named_shardings.json b/tests/utils/sharding_info/llama3.1-405b/v5e-16/slice_8192/named_shardings.json deleted file mode 100644 index ec82c397ec..0000000000 --- a/tests/utils/sharding_info/llama3.1-405b/v5e-16/slice_8192/named_shardings.json 
+++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - 
".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - 
"tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - 
"stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - 
".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - 
}, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - 
"partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": 
{ - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - 
"context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", 
- "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - 
"context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - 
"partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - 
"context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-405b/v5e-256/slice_1/named_shardings.json b/tests/utils/sharding_info/llama3.1-405b/v5e-256/slice_1/named_shardings.json deleted file mode 100644 index cb1aafab49..0000000000 --- a/tests/utils/sharding_info/llama3.1-405b/v5e-256/slice_1/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - 
"expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - 
"fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", 
- "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - 
"tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 
1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - 
"tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - 
"context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - 
"tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - 
"tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - 
"context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - 
"context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-405b/v5e-256/slice_4/named_shardings.json b/tests/utils/sharding_info/llama3.1-405b/v5e-256/slice_4/named_shardings.json deleted file mode 100644 index 0d58998984..0000000000 --- a/tests/utils/sharding_info/llama3.1-405b/v5e-256/slice_4/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" 
- ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - 
"data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - 
"data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - 
"stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 
256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, 
- "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - 
"context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - 
"data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", 
- "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - 
"sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - 
"tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" 
- ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - 
"tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-405b/v5p-16/slice_1/named_shardings.json b/tests/utils/sharding_info/llama3.1-405b/v5p-16/slice_1/named_shardings.json deleted file mode 100644 index 610f5d7016..0000000000 --- a/tests/utils/sharding_info/llama3.1-405b/v5p-16/slice_1/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 
1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - 
".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - 
"expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": 
{ - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - 
"fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - 
"tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - 
"context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - 
"mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - 
"tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - 
"context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - 
"tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - 
"tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - 
"sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-405b/v5p-16/slice_4/named_shardings.json b/tests/utils/sharding_info/llama3.1-405b/v5p-16/slice_4/named_shardings.json deleted file mode 100644 index 09d3011378..0000000000 --- a/tests/utils/sharding_info/llama3.1-405b/v5p-16/slice_4/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - 
"tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - 
".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - 
"partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - 
"stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - 
"mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - 
"tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - 
"tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { 
- "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - 
"tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - 
"context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - 
"context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - 
"fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-405b/v5p-16/slice_8192/named_shardings.json b/tests/utils/sharding_info/llama3.1-405b/v5p-16/slice_8192/named_shardings.json deleted file mode 100644 index 523c1774ad..0000000000 --- a/tests/utils/sharding_info/llama3.1-405b/v5p-16/slice_8192/named_shardings.json 
+++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - 
".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - 
"tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - 
"stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - 
".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, 
- ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - 
"partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { 
- "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - 
"context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - 
"expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", 
- "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": 
[ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - 
"context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-405b/v5p-256/slice_1/named_shardings.json b/tests/utils/sharding_info/llama3.1-405b/v5p-256/slice_1/named_shardings.json deleted file mode 100644 index cf39bdb9e2..0000000000 --- a/tests/utils/sharding_info/llama3.1-405b/v5p-256/slice_1/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - 
"expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - 
"fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", 
- "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - 
"tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 
1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - 
"tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - 
"context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - 
"tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - 
"tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - 
"context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - 
"context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-405b/v5p-256/slice_4/named_shardings.json b/tests/utils/sharding_info/llama3.1-405b/v5p-256/slice_4/named_shardings.json deleted file mode 100644 index ef5e7c9681..0000000000 --- a/tests/utils/sharding_info/llama3.1-405b/v5p-256/slice_4/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" 
- ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - 
"data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - 
"data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - 
"stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 
128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, 
- "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - 
"context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - 
"data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", 
- "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - 
"sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - 
"tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" 
- ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - 
"tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-405b/v6e-16/slice_1/named_shardings.json b/tests/utils/sharding_info/llama3.1-405b/v6e-16/slice_1/named_shardings.json deleted file mode 100644 index 31eb26c795..0000000000 --- a/tests/utils/sharding_info/llama3.1-405b/v6e-16/slice_1/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - 
"stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - 
".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - 
"expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - 
"shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", 
- "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - 
"stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - 
"partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - 
"context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - 
".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - 
"expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - 
"data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - 
"sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - 
"fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - 
"autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-405b/v6e-16/slice_8192/named_shardings.json b/tests/utils/sharding_info/llama3.1-405b/v6e-16/slice_8192/named_shardings.json deleted file mode 100644 index ec82c397ec..0000000000 --- a/tests/utils/sharding_info/llama3.1-405b/v6e-16/slice_8192/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - 
"data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - 
"tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 
1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - 
"tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - 
"tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - 
"tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - 
"autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - 
"tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - 
"sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - 
"context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - 
"tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 
1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - 
"data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git 
a/tests/utils/sharding_info/llama3.1-405b/v6e-256/slice_1/named_shardings.json b/tests/utils/sharding_info/llama3.1-405b/v6e-256/slice_1/named_shardings.json deleted file mode 100644 index cb1aafab49..0000000000 --- a/tests/utils/sharding_info/llama3.1-405b/v6e-256/slice_1/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - 
"expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", 
- "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - 
"axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - 
"autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - 
"fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - 
"stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - 
"axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - 
"tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - 
"expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - 
"stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - 
"fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - 
"context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - 
"tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-405b/v6e-256/slice_4/named_shardings.json b/tests/utils/sharding_info/llama3.1-405b/v6e-256/slice_4/named_shardings.json deleted file mode 100644 index 0d58998984..0000000000 --- a/tests/utils/sharding_info/llama3.1-405b/v6e-256/slice_4/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - 
], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - 
".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - 
"context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", 
- "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - 
"tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - 
"fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 
1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - 
"sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" 
- ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 
1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - 
"tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] 
- }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - 
"context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-70b/v5e-16/slice_4/named_shardings.json b/tests/utils/sharding_info/llama3.1-70b/v5e-16/slice_4/named_shardings.json deleted file mode 100644 index 733efdf3e5..0000000000 --- a/tests/utils/sharding_info/llama3.1-70b/v5e-16/slice_4/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - 
"axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - 
"data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ 
- "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - 
"tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - 
"fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", 
- "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - 
"tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - 
"context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - 
"context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, 
- "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - 
"sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - 
"context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - 
"context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-70b/v5e-16/slice_8192/named_shardings.json b/tests/utils/sharding_info/llama3.1-70b/v5e-16/slice_8192/named_shardings.json deleted file mode 100644 index ec82c397ec..0000000000 --- a/tests/utils/sharding_info/llama3.1-70b/v5e-16/slice_8192/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - 
".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 
1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 
16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - 
".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - 
"tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - 
"autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - 
"context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, 
- "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - 
"tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, 
- "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - 
"context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - 
"context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-70b/v5e-256/slice_1/named_shardings.json b/tests/utils/sharding_info/llama3.1-70b/v5e-256/slice_1/named_shardings.json deleted file mode 100644 index cb1aafab49..0000000000 --- a/tests/utils/sharding_info/llama3.1-70b/v5e-256/slice_1/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - 
"autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], 
- "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - 
"axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - 
"autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, 
- "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - 
"data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - 
"stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - 
"stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - 
"sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - 
"axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - 
"fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 
256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - 
"context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - 
"sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-70b/v5e-256/slice_4/named_shardings.json b/tests/utils/sharding_info/llama3.1-70b/v5e-256/slice_4/named_shardings.json deleted file mode 100644 index 0d58998984..0000000000 --- a/tests/utils/sharding_info/llama3.1-70b/v5e-256/slice_4/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - 
"expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, 
- ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - 
"context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" 
- ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - 
"stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ 
- "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, 
- "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - 
"context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - 
".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - 
"expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { 
- "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - 
"sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - 
"fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - 
"autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-70b/v5p-16/slice_8192/named_shardings.json b/tests/utils/sharding_info/llama3.1-70b/v5p-16/slice_8192/named_shardings.json deleted file mode 100644 index 523c1774ad..0000000000 --- a/tests/utils/sharding_info/llama3.1-70b/v5p-16/slice_8192/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - 
"data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - 
"tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, 
- "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - 
"tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", 
- "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - 
"tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - 
"autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", 
- "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - 
"context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", 
- "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - 
"tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 8, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git 
a/tests/utils/sharding_info/llama3.1-70b/v5p-256/slice_1/named_shardings.json b/tests/utils/sharding_info/llama3.1-70b/v5p-256/slice_1/named_shardings.json deleted file mode 100644 index cf39bdb9e2..0000000000 --- a/tests/utils/sharding_info/llama3.1-70b/v5p-256/slice_1/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - 
"expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", 
- "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - 
"axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - 
"autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - 
"fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - 
"stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - 
"axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - 
"tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - 
"expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - 
"stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - 
"fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - 
"context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - 
"tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-70b/v5p-256/slice_4/named_shardings.json b/tests/utils/sharding_info/llama3.1-70b/v5p-256/slice_4/named_shardings.json deleted file mode 100644 index ef5e7c9681..0000000000 --- a/tests/utils/sharding_info/llama3.1-70b/v5p-256/slice_4/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], 
- "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - 
".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - 
"context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", 
- "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - 
"tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - 
"fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 
1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - 
"sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" 
- ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 
1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - 
"tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] 
- }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - 
"context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 128, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-70b/v6e-16/slice_1/named_shardings.json b/tests/utils/sharding_info/llama3.1-70b/v6e-16/slice_1/named_shardings.json deleted file mode 100644 index 31eb26c795..0000000000 --- a/tests/utils/sharding_info/llama3.1-70b/v6e-16/slice_1/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - 
"axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - 
"data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ 
- "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - 
"tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - 
"fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", 
- "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - 
"tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - 
"context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - 
"context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, 
- "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - 
"sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - 
"context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - 
"context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-70b/v6e-16/slice_4/named_shardings.json b/tests/utils/sharding_info/llama3.1-70b/v6e-16/slice_4/named_shardings.json deleted file mode 100644 index 733efdf3e5..0000000000 --- a/tests/utils/sharding_info/llama3.1-70b/v6e-16/slice_4/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - 
"mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - 
"partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - 
"context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { 
- "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - 
"tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], 
- "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - 
"stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - 
"context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - 
"fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - 
"context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - 
"data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-70b/v6e-16/slice_8192/named_shardings.json b/tests/utils/sharding_info/llama3.1-70b/v6e-16/slice_8192/named_shardings.json deleted file mode 100644 index ec82c397ec..0000000000 --- a/tests/utils/sharding_info/llama3.1-70b/v6e-16/slice_8192/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - 
"expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - 
"autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { 
- "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { 
- "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } 
- }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - 
"autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 
8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - 
"sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - 
[ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - 
"autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - 
"axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - 
"autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - 
"fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - 
".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 8192, - "stage": 1, - "fsdp": 16, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - 
"partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-70b/v6e-256/slice_1/named_shardings.json b/tests/utils/sharding_info/llama3.1-70b/v6e-256/slice_1/named_shardings.json deleted file mode 100644 index cb1aafab49..0000000000 --- a/tests/utils/sharding_info/llama3.1-70b/v6e-256/slice_1/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - 
"tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - 
"sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - 
".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - 
"context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - 
"tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - 
"tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 
1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - 
"sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - 
"tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - 
"autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - 
"tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 
1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 1, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-70b/v6e-256/slice_4/named_shardings.json b/tests/utils/sharding_info/llama3.1-70b/v6e-256/slice_4/named_shardings.json deleted file mode 100644 index 0d58998984..0000000000 --- a/tests/utils/sharding_info/llama3.1-70b/v6e-256/slice_4/named_shardings.json +++ /dev/null @@ -1,1760 +0,0 @@ -{ - ".step": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".params/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - 
"tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": 
{ - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" 
- ], - "stage" - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 
1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".params/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - 
"context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - 
"autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 
1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - 
"stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - 
"tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - 
"context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ], - "stage", - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp_transpose", - "tensor", - "tensor_sequence", - "autoregressive" - ], - "stage", - [ - "fsdp", - "sequence", - "tensor_transpose", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - 
"context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence" - ], - "stage" - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - "stage", - null, - [ - "fsdp", - 
"fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - null - ] - }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 
256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ] - ] - }, - ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [ - [ - "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" - ], - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ] - ] - }, - ".opt_state/[2]/.count": { - "mesh": { - "axis_names": [ - "data", - "stage", - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "context_autoregressive", - "tensor", - "tensor_transpose", - "tensor_sequence", - "expert", - "autoregressive" - ], - "shape": { - "data": 4, - "stage": 1, - "fsdp": 256, - "fsdp_transpose": 1, - "sequence": 1, - "context": 1, - "context_autoregressive": 1, - "tensor": 1, - "tensor_transpose": 1, - "tensor_sequence": 1, - "expert": 1, - "autoregressive": 1 - } - }, - "partition_spec": [] - } -} \ No newline at end of file diff --git a/tests/utils/sharding_info/qwen3-0.6b/tpu7x-16/slice_1/logical_shardings.json b/tests/utils/sharding_info/qwen3-0.6b/tpu7x-16/slice_1/logical_shardings.json new file mode 100644 index 0000000000..487e9bb959 --- /dev/null +++ 
b/tests/utils/sharding_info/qwen3-0.6b/tpu7x-16/slice_1/logical_shardings.json @@ -0,0 +1,464 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + 
".params/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + 
"partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-405b/v5e-16/slice_1/named_shardings.json b/tests/utils/sharding_info/qwen3-0.6b/tpu7x-16/slice_1/named_shardings.json similarity index 88% rename from 
tests/utils/sharding_info/llama3.1-405b/v5e-16/slice_1/named_shardings.json rename to tests/utils/sharding_info/qwen3-0.6b/tpu7x-16/slice_1/named_shardings.json index 31eb26c795..0ad9713479 100644 --- a/tests/utils/sharding_info/llama3.1-405b/v5e-16/slice_1/named_shardings.json +++ b/tests/utils/sharding_info/qwen3-0.6b/tpu7x-16/slice_1/named_shardings.json @@ -30,7 +30,8 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] }, ".params/['params']/['decoder']/['decoder_norm']/['scale']": { "mesh": { @@ -66,9 +67,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -117,6 +120,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -165,6 +173,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -213,6 +226,11 @@ "context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -249,10 +267,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -289,10 +310,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -342,6 +366,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { 
+ "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -391,6 +464,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -440,9 +519,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + ".params/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -474,24 +559,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -530,12 +609,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".params/['params']/['token_embedder']/['embedding']": { @@ -583,6 +670,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[0]/.count": { @@ -616,7 +707,8 @@ 
"autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] }, ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { "mesh": { @@ -652,9 +744,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -703,6 +797,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -751,6 +850,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -799,6 +903,11 @@ "context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -835,10 +944,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -875,10 +987,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -928,6 +1043,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + 
"fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -977,6 +1141,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -1026,9 +1196,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -1060,24 +1236,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -1116,12 +1286,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { @@ -1169,6 +1347,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { @@ -1205,9 +1387,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -1256,6 +1440,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -1304,6 +1493,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -1352,6 +1546,11 @@ "context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -1388,10 +1587,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -1428,10 +1630,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -1481,6 +1686,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -1530,6 +1784,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -1579,9 +1839,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -1613,24 +1879,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -1669,12 +1929,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { @@ -1722,6 +1990,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[2]/.count": { @@ -1755,6 +2027,7 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] } } \ No newline at end of file diff --git a/tests/utils/sharding_info/qwen3-0.6b/tpu7x-16/slice_4/logical_shardings.json b/tests/utils/sharding_info/qwen3-0.6b/tpu7x-16/slice_4/logical_shardings.json new file mode 100644 index 0000000000..487e9bb959 --- /dev/null +++ b/tests/utils/sharding_info/qwen3-0.6b/tpu7x-16/slice_4/logical_shardings.json @@ -0,0 +1,464 @@ +{ + ".step": { + 
"partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + 
"shape": [ + 128, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + 
"partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-405b/v6e-16/slice_4/named_shardings.json b/tests/utils/sharding_info/qwen3-0.6b/tpu7x-16/slice_4/named_shardings.json similarity index 88% rename from tests/utils/sharding_info/llama3.1-405b/v6e-16/slice_4/named_shardings.json rename to tests/utils/sharding_info/qwen3-0.6b/tpu7x-16/slice_4/named_shardings.json index 733efdf3e5..8e13360273 100644 --- 
a/tests/utils/sharding_info/llama3.1-405b/v6e-16/slice_4/named_shardings.json +++ b/tests/utils/sharding_info/qwen3-0.6b/tpu7x-16/slice_4/named_shardings.json @@ -30,7 +30,8 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] }, ".params/['params']/['decoder']/['decoder_norm']/['scale']": { "mesh": { @@ -66,9 +67,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -117,6 +120,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -165,6 +173,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -213,6 +226,11 @@ "context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -249,10 +267,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -289,10 +310,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -342,6 +366,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + 
"autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -391,6 +464,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -440,9 +519,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + ".params/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -474,24 +559,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -530,12 +609,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".params/['params']/['token_embedder']/['embedding']": { @@ -583,6 +670,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[0]/.count": { @@ -616,7 +707,8 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] }, ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { "mesh": { @@ -652,9 +744,11 @@ 
"partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -703,6 +797,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -751,6 +850,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -799,6 +903,11 @@ "context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -835,10 +944,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -875,10 +987,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -928,6 +1043,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + 
"partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -977,6 +1141,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -1026,9 +1196,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -1060,24 +1236,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -1116,12 +1286,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { @@ -1169,6 +1347,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { @@ -1205,9 +1387,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -1256,6 +1440,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -1304,6 +1493,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -1352,6 +1546,11 @@ "context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -1388,10 +1587,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -1428,10 +1630,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -1481,6 +1686,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -1530,6 +1784,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -1579,9 +1839,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -1613,24 +1879,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -1669,12 +1929,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { @@ -1722,6 +1990,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[2]/.count": { @@ -1755,6 +2027,7 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] } } \ No newline at end of file diff --git a/tests/utils/sharding_info/qwen3-0.6b/v5p-16/slice_1/logical_shardings.json b/tests/utils/sharding_info/qwen3-0.6b/v5p-16/slice_1/logical_shardings.json new file mode 100644 index 0000000000..487e9bb959 --- /dev/null +++ b/tests/utils/sharding_info/qwen3-0.6b/v5p-16/slice_1/logical_shardings.json @@ -0,0 +1,464 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + 
".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + 
"kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + 
}, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + 
"embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-70b/v5p-16/slice_1/named_shardings.json b/tests/utils/sharding_info/qwen3-0.6b/v5p-16/slice_1/named_shardings.json similarity index 88% rename from tests/utils/sharding_info/llama3.1-70b/v5p-16/slice_1/named_shardings.json rename to tests/utils/sharding_info/qwen3-0.6b/v5p-16/slice_1/named_shardings.json index 610f5d7016..40d1315185 100644 --- a/tests/utils/sharding_info/llama3.1-70b/v5p-16/slice_1/named_shardings.json +++ b/tests/utils/sharding_info/qwen3-0.6b/v5p-16/slice_1/named_shardings.json @@ -30,7 +30,8 @@ "autoregressive": 1 } }, - 
"partition_spec": [] + "partition_spec": [], + "shape": [] }, ".params/['params']/['decoder']/['decoder_norm']/['scale']": { "mesh": { @@ -66,9 +67,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -117,6 +120,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -165,6 +173,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -213,6 +226,11 @@ "context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -249,10 +267,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -289,10 +310,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -342,6 +366,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + 
"tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -391,6 +464,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -440,9 +519,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + ".params/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -474,24 +559,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -530,12 +609,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".params/['params']/['token_embedder']/['embedding']": { @@ -583,6 +670,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[0]/.count": { @@ -616,7 +707,8 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] }, ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { "mesh": { @@ -652,9 +744,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -703,6 +797,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -751,6 +850,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -799,6 +903,11 @@ "context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -835,10 +944,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -875,10 +987,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -928,6 +1043,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -977,6 +1141,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -1026,9 +1196,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -1060,24 +1236,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -1116,12 +1286,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { @@ -1169,6 +1347,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { @@ -1205,9 +1387,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -1256,6 +1440,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -1304,6 +1493,11 @@ 
"tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -1352,6 +1546,11 @@ "context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -1388,10 +1587,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -1428,10 +1630,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -1481,6 +1686,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -1530,6 +1784,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -1579,9 +1839,15 @@ 
"autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -1613,24 +1879,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -1669,12 +1929,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { @@ -1722,6 +1990,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[2]/.count": { @@ -1755,6 +2027,7 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] } } \ No newline at end of file diff --git a/tests/utils/sharding_info/qwen3-0.6b/v5p-16/slice_4/logical_shardings.json b/tests/utils/sharding_info/qwen3-0.6b/v5p-16/slice_4/logical_shardings.json new file mode 100644 index 0000000000..487e9bb959 --- /dev/null +++ b/tests/utils/sharding_info/qwen3-0.6b/v5p-16/slice_4/logical_shardings.json @@ -0,0 +1,464 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, 
+ 3072 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + 
"vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + 
"kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-70b/v5p-16/slice_4/named_shardings.json b/tests/utils/sharding_info/qwen3-0.6b/v5p-16/slice_4/named_shardings.json similarity index 88% rename from tests/utils/sharding_info/llama3.1-70b/v5p-16/slice_4/named_shardings.json rename to tests/utils/sharding_info/qwen3-0.6b/v5p-16/slice_4/named_shardings.json index 09d3011378..5fc1a68eed 100644 --- a/tests/utils/sharding_info/llama3.1-70b/v5p-16/slice_4/named_shardings.json +++ b/tests/utils/sharding_info/qwen3-0.6b/v5p-16/slice_4/named_shardings.json @@ -30,7 +30,8 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] }, 
".params/['params']/['decoder']/['decoder_norm']/['scale']": { "mesh": { @@ -66,9 +67,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -117,6 +120,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -165,6 +173,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -213,6 +226,11 @@ "context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -249,10 +267,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -289,10 +310,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -342,6 +366,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + 
"partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -391,6 +464,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -440,9 +519,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + ".params/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -474,24 +559,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -530,12 +609,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".params/['params']/['token_embedder']/['embedding']": { @@ -583,6 +670,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[0]/.count": { @@ -616,7 +707,8 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] }, ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { "mesh": { @@ -652,9 +744,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -703,6 +797,11 @@ "tensor_sequence", "autoregressive" ] + 
], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -751,6 +850,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -799,6 +903,11 @@ "context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -835,10 +944,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -875,10 +987,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -928,6 +1043,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -977,6 +1141,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 
1024 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -1026,9 +1196,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -1060,24 +1236,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -1116,12 +1286,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { @@ -1169,6 +1347,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { @@ -1205,9 +1387,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -1256,6 +1440,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -1304,6 +1493,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -1352,6 +1546,11 @@ 
"context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -1388,10 +1587,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -1428,10 +1630,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -1481,6 +1686,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -1530,6 +1784,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -1579,9 +1839,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -1613,24 +1879,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -1669,12 +1929,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { @@ -1722,6 +1990,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[2]/.count": { @@ -1755,6 +2027,7 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] } } \ No newline at end of file diff --git a/tests/utils/sharding_info/qwen3-0.6b/v6e-16/slice_1/logical_shardings.json b/tests/utils/sharding_info/qwen3-0.6b/v6e-16/slice_1/logical_shardings.json new file mode 100644 index 0000000000..487e9bb959 --- /dev/null +++ b/tests/utils/sharding_info/qwen3-0.6b/v6e-16/slice_1/logical_shardings.json @@ -0,0 +1,464 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 
28, + 3072 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + 
"norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-70b/v5e-16/slice_1/named_shardings.json b/tests/utils/sharding_info/qwen3-0.6b/v6e-16/slice_1/named_shardings.json similarity index 88% rename from tests/utils/sharding_info/llama3.1-70b/v5e-16/slice_1/named_shardings.json rename to tests/utils/sharding_info/qwen3-0.6b/v6e-16/slice_1/named_shardings.json index 31eb26c795..0ad9713479 100644 --- a/tests/utils/sharding_info/llama3.1-70b/v5e-16/slice_1/named_shardings.json +++ b/tests/utils/sharding_info/qwen3-0.6b/v6e-16/slice_1/named_shardings.json @@ -30,7 +30,8 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] }, ".params/['params']/['decoder']/['decoder_norm']/['scale']": { "mesh": { @@ -66,9 +67,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + 
"tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -117,6 +120,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -165,6 +173,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -213,6 +226,11 @@ "context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -249,10 +267,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -289,10 +310,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -342,6 +366,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, 
".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -391,6 +464,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -440,9 +519,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + ".params/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -474,24 +559,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -530,12 +609,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".params/['params']/['token_embedder']/['embedding']": { @@ -583,6 +670,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[0]/.count": { @@ -616,7 +707,8 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] }, ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { "mesh": { @@ -652,9 +744,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -703,6 +797,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -751,6 +850,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -799,6 +903,11 @@ "context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -835,10 +944,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -875,10 +987,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -928,6 +1043,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -977,6 +1141,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -1026,9 +1196,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -1060,24 +1236,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -1116,12 +1286,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { @@ -1169,6 +1347,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { @@ -1205,9 +1387,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -1256,6 +1440,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -1304,6 +1493,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -1352,6 +1546,11 @@ "context", 
"expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -1388,10 +1587,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -1428,10 +1630,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -1481,6 +1686,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 1, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -1530,6 +1784,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -1579,9 +1839,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -1613,24 +1879,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -1669,12 +1929,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { @@ -1722,6 +1990,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[2]/.count": { @@ -1755,6 +2027,7 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] } } \ No newline at end of file diff --git a/tests/utils/sharding_info/qwen3-0.6b/v6e-16/slice_4/logical_shardings.json b/tests/utils/sharding_info/qwen3-0.6b/v6e-16/slice_4/logical_shardings.json new file mode 100644 index 0000000000..487e9bb959 --- /dev/null +++ b/tests/utils/sharding_info/qwen3-0.6b/v6e-16/slice_4/logical_shardings.json @@ -0,0 +1,464 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 
28, + 3072 + ] + }, + ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "mlp" + ], + "shape": [ + 1024, + 28, + 3072 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "layers", + "embed" + ], + "shape": [ + 3072, + 28, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 1024, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "partition_spec": [ + 
"norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "layers", + "kv", + "embed" + ], + "shape": [ + 16, + 28, + 128, + 1024 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "q_heads", + "kv" + ], + "shape": [ + 1024, + 28, + 16, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { + "partition_spec": [ + "norm", + "layers" + ], + "shape": [ + 128, + 28 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + "partition_spec": [ + "embed", + "layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed" + ], + "shape": [ + 151936, + 1024 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file diff --git a/tests/utils/sharding_info/llama3.1-405b/v5e-16/slice_4/named_shardings.json b/tests/utils/sharding_info/qwen3-0.6b/v6e-16/slice_4/named_shardings.json similarity index 88% rename from tests/utils/sharding_info/llama3.1-405b/v5e-16/slice_4/named_shardings.json rename to tests/utils/sharding_info/qwen3-0.6b/v6e-16/slice_4/named_shardings.json index 733efdf3e5..8e13360273 100644 --- a/tests/utils/sharding_info/llama3.1-405b/v5e-16/slice_4/named_shardings.json +++ b/tests/utils/sharding_info/qwen3-0.6b/v6e-16/slice_4/named_shardings.json @@ -30,7 +30,8 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] }, ".params/['params']/['decoder']/['decoder_norm']/['scale']": { "mesh": { @@ -66,9 +67,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + 
"tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -117,6 +120,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -165,6 +173,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".params/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -213,6 +226,11 @@ "context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".params/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -249,10 +267,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".params/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -289,10 +310,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -342,6 +366,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".params/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, 
".params/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -391,6 +464,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, ".params/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -440,9 +519,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + ".params/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -474,24 +559,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + ".params/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -530,12 +609,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".params/['params']/['token_embedder']/['embedding']": { @@ -583,6 +670,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[0]/.count": { @@ -616,7 +707,8 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] }, ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { "mesh": { @@ -652,9 +744,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -703,6 +797,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -751,6 +850,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -799,6 +903,11 @@ "context", "expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -835,10 +944,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -875,10 +987,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -928,6 +1043,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -977,6 +1141,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, 
".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -1026,9 +1196,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -1060,24 +1236,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + ".opt_state/[0]/.mu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -1116,12 +1286,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { @@ -1169,6 +1347,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { @@ -1205,9 +1387,11 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ] + ], + "shape": [ + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_0']/['kernel']": { @@ -1256,6 +1440,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wi_1']/['kernel']": { @@ -1304,6 +1493,11 @@ "tensor_sequence", "autoregressive" ] + ], + "shape": [ + 1024, + 28, + 3072 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['mlp']/['wo']/['kernel']": { @@ -1352,6 +1546,11 @@ "context", 
"expert" ] + ], + "shape": [ + 3072, + 28, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['post_self_attention_layer_norm']/['scale']": { @@ -1388,10 +1587,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['pre_self_attention_layer_norm']/['scale']": { @@ -1428,10 +1630,13 @@ "partition_spec": [ [ "tensor", - "tensor_transpose", - "tensor_sequence" + "tensor_transpose" ], "stage" + ], + "shape": [ + 1024, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key']/['kernel']": { @@ -1481,6 +1686,55 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 8, + 128 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['key_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "stage", + "fsdp", + "fsdp_transpose", + "sequence", + "context", + "context_autoregressive", + "tensor", + "tensor_transpose", + "tensor_sequence", + "expert", + "autoregressive" + ], + "shape": { + "data": 4, + "stage": 1, + "fsdp": 16, + "fsdp_transpose": 1, + "sequence": 1, + "context": 1, + "context_autoregressive": 1, + "tensor": 1, + "tensor_transpose": 1, + "tensor_sequence": 1, + "expert": 1, + "autoregressive": 1 + } + }, + "partition_spec": [ + [ + "tensor", + "tensor_transpose" + ], + "stage" + ], + "shape": [ + 128, + 28 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['out']/['kernel']": { @@ -1530,6 +1784,12 @@ "context", "expert" ] + ], + "shape": [ + 16, + 28, + 128, + 1024 ] }, ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query']/['kernel']": { @@ -1579,9 +1839,15 @@ "autoregressive" ], null + ], + "shape": [ + 1024, + 28, + 16, + 128 ] }, - ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { + 
".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['query_norm']/['scale']": { "mesh": { "axis_names": [ "data", @@ -1613,24 +1879,18 @@ } }, "partition_spec": [ - [ - "fsdp", - "fsdp_transpose", - "sequence", - "context", - "expert" - ], - "stage", [ "tensor", - "tensor_transpose", - "tensor_sequence", - "autoregressive" + "tensor_transpose" ], - null + "stage" + ], + "shape": [ + 128, + 28 ] }, - ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + ".opt_state/[0]/.nu/['params']/['decoder']/['layers']/['self_attention']/['value']/['kernel']": { "mesh": { "axis_names": [ "data", @@ -1669,12 +1929,20 @@ "context", "expert" ], + "stage", [ "tensor", "tensor_transpose", "tensor_sequence", "autoregressive" - ] + ], + null + ], + "shape": [ + 1024, + 28, + 8, + 128 ] }, ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { @@ -1722,6 +1990,10 @@ "context", "expert" ] + ], + "shape": [ + 151936, + 1024 ] }, ".opt_state/[2]/.count": { @@ -1755,6 +2027,7 @@ "autoregressive": 1 } }, - "partition_spec": [] + "partition_spec": [], + "shape": [] } } \ No newline at end of file