Source code for oumi.datasets.vision_language.vision_dpo_jsonlines

# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import pandas as pd
from typing_extensions import override

from oumi.core.datasets import VisionLanguageDpoDataset
from oumi.core.registry import register_dataset
from oumi.utils.io_utils import load_jsonlines


[docs] @register_dataset("vision_dpo_jsonl") class VisionDpoJsonlinesDataset(VisionLanguageDpoDataset): """VisionDpoJsonlinesDataset for loading Vision-Language DPO data in Oumi format. This dataset class is designed to work with JSON Lines (.jsonl) files containing Vision-Language Direct Preference Optimization (DPO) data. It supports loading data either from a file or from a provided list of data samples. See `VisionLanguageDpoDataset` for more details. Example:: dataset = VisionDpoJsonlinesDataset( dataset_path="data/dataset_examples/vision_language_dpo_format.jsonl" ) """ default_dataset = "vision_dpo_jsonl" def __init__( self, *, dataset_name: Optional[str] = None, dataset_path: Optional[str] = None, data: Optional[list[dict]] = None, **kwargs, ): """Initialize the VisionDpoJsonlinesDataset. Args: dataset_name: Name of the dataset (for registry purposes). dataset_path: Path to the JSONL file containing vision DPO data. data: List of data samples to use instead of loading from file. **kwargs: Additional arguments passed to the parent class. Raises: ValueError: If neither dataset_path nor data is provided. """ if dataset_path is not None and data is not None: raise ValueError("Only one of dataset_path or data must be provided") if data is not None: rows = data elif dataset_path is not None: rows = load_jsonlines(dataset_path) else: raise ValueError("Either dataset_path or data must be provided") self._data = pd.DataFrame(rows) super().__init__(dataset_name=dataset_name, dataset_path=dataset_path, **kwargs) @override def _load_data(self) -> pd.DataFrame: """Load the data from the provided samples.""" # data is already loaded in the constructor, no need to load again return self._data