Source code for oumi.cli.synth

# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from datetime import datetime
from pathlib import Path
from typing import Annotated

import typer
from rich.table import Table

import oumi.cli.cli_utils as cli_utils
from oumi.utils.logging import logger

_MAX_TABLE_ROWS = 1
_MAX_REPRESENTATION_LENGTH = 200
_TABLE_COLUMNS_TO_DISPLAY = 6


[docs] def synth( ctx: typer.Context, config: Annotated[ str, typer.Option( *cli_utils.CONFIG_FLAGS, help="Path to the configuration file for synthesis.", ), ], level: cli_utils.LOG_LEVEL_TYPE = None, ): """Synthesize a dataset. Args: ctx: The Typer context object. config: Path to the configuration file for synthesis. level: The logging level for the specified command. """ extra_args = cli_utils.parse_extra_cli_args(ctx) config = str(cli_utils.resolve_and_fetch_config(config)) with cli_utils.CONSOLE.status( "[green]Loading configuration...[/green]", spinner="dots" ): # Delayed imports from oumi import synthesize as oumi_synthesize from oumi.core.configs.synthesis_config import SynthesisConfig # End imports # Load configuration parsed_config: SynthesisConfig = SynthesisConfig.from_yaml_and_arg_list( config, extra_args, logger=logger ) parsed_config.finalize_and_validate() output_path = parsed_config.output_path if not output_path: cwd = Path.cwd() timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_path = cwd / f"oumi_synth_results_{timestamp}.jsonl" if output_path.exists(): i = 1 while output_path.exists(): output_path = cwd / f"oumi_synth_results_{timestamp}_{i}.jsonl" i += 1 parsed_config.output_path = output_path.as_posix() # Run synthesis with cli_utils.CONSOLE.status( "[green]Synthesizing dataset...[/green]", spinner="dots" ): results = oumi_synthesize(parsed_config) if not results: cli_utils.CONSOLE.print( "No results found, please check your configuration and try again." "Report this issue at https://github.com/oumi-ai/oumi/issues" ) return # Display results table table = Table( title="Synthesis Results", title_style="bold magenta", show_edge=False, show_lines=True, ) columns = list(results[0].keys()) column_count = len(columns) additional_column = ( f"... and {column_count - _TABLE_COLUMNS_TO_DISPLAY + 1} more columns..." ) if column_count > _TABLE_COLUMNS_TO_DISPLAY: # Keep first N-1 columns and add the additional column columns = columns[: _TABLE_COLUMNS_TO_DISPLAY - 1] columns.append(additional_column) for column in columns: table.add_column(column, style="green") for i, result in enumerate(results[:_MAX_TABLE_ROWS]): # Show first 5 samples representations = [] for column in columns: if column == additional_column: representation = "..." else: representation = repr(result[column]) if len(representation) > _MAX_REPRESENTATION_LENGTH: representation = representation[:_MAX_REPRESENTATION_LENGTH] + "..." representations.append(representation) table.add_row(*representations) cli_utils.CONSOLE.print(table) if len(results) > _MAX_TABLE_ROWS: cli_utils.CONSOLE.print( f"... and {len(results) - _MAX_TABLE_ROWS} more samples" ) cli_utils.CONSOLE.print( f"\n[green]Successfully synthesized {len(results)} samples and saved to " f"{parsed_config.output_path}[/green]" ) cli_utils.CONSOLE.print( f"\n\n[green]To train a model, run: oumi train -c " f"path/to/your/train/config.yaml\n\n" f"If you included a 'conversation' chat attribute in your config, update the " f"config to use your new dataset:\n" f"data:\n" f" train:\n" f" datasets:\n" f' - dataset_name: "text_sft_jsonl"\n' f' dataset_path: "{parsed_config.output_path}"\n' f"[/green]" )