class DSPy::Teleprompt::DataHandler
Provides efficient data-handling operations for large example datasets during bootstrap and optimization.
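A minimal usage sketch, assuming the library is already loaded; plain hashes stand in for the example objects (typically DSPy::Example instances) purely for illustration, since the handler only relies on standard Array operations:

# Hypothetical training examples; any Array works here.
examples = [
  { question: "2 + 2?", answer: "4" },
  { question: "Capital of France?", answer: "Paris" },
  { question: "Largest planet?", answer: "Jupiter" }
]

handler = DSPy::Teleprompt::DataHandler.new(examples)
handler.statistics[:total_examples]  # => 3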
def create_candidate_sets(num_sets, set_size, random_state: nil)
def create_candidate_sets(num_sets, set_size, random_state: nil)
  return Array.new(num_sets) { [] } if @examples.empty?

  srand(random_state) if random_state

  candidate_sets = []
  actual_set_size = [set_size, @examples.size].min

  num_sets.times do |i|
    # Use a different random state for each set to ensure variety
    current_seed = random_state ? random_state + i : nil
    srand(current_seed) if current_seed

    set_examples = @examples.sample(actual_set_size)
    candidate_sets << set_examples
  end

  candidate_sets
end
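A sketch of drawing candidate sets for a bootstrap run with the handler built above; passing random_state makes the draws reproducible, and set sizes are capped at the number of available examples:

# Draw 4 candidate sets of up to 2 examples each, reproducibly.
sets = handler.create_candidate_sets(4, 2, random_state: 42)

sets.each_with_index do |set, i|
  puts "Candidate set #{i}: #{set.size} examples"
end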
def each_batch(batch_size)
def each_batch(batch_size)
  @examples.each_slice(batch_size)
end
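The method returns the enumerator produced by each_slice rather than yielding directly, so batches are consumed by iterating over the return value; a minimal sketch using the handler above:

# Process the examples in batches of 2 (the last batch may be smaller).
handler.each_batch(2).each do |batch|
  puts "Evaluating batch of #{batch.size} examples"
end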
def initialize(examples)
def initialize(examples)
  @examples = examples
end
def partition_by_success(successful_indices)
def partition_by_success(successful_indices)
  successful_examples = successful_indices.map { |i| @examples[i] if i < @examples.size }.compact
  failed_indices = (0...@examples.size).to_a - successful_indices
  failed_examples = failed_indices.map { |i| @examples[i] }

  [successful_examples, failed_examples]
end
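A sketch of splitting the handler's examples by evaluation outcome, assuming (hypothetically) that the examples at indices 0 and 2 passed some metric:

# Indices of examples that passed evaluation (hypothetical result).
successful_indices = [0, 2]

passed, failed = handler.partition_by_success(successful_indices)
puts "passed: #{passed.size}, failed: #{failed.size}"  # => passed: 2, failed: 1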
def sample(n, random_state: nil)
def sample(n, random_state: nil)
  return [] if @examples.empty? || n <= 0

  # Handle case where n is larger than available examples
  actual_n = [n, @examples.size].min

  # Set random seed if provided
  srand(random_state) if random_state

  @examples.sample(actual_n)
end
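Because random_state seeds Ruby's global generator via srand, repeated calls with the same seed return the same subset; a minimal sketch:

subset_a = handler.sample(2, random_state: 7)
subset_b = handler.sample(2, random_state: 7)
subset_a == subset_b  # => true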
def shuffle(random_state: nil)
def shuffle(random_state: nil)
  srand(random_state) if random_state
  @examples.shuffle
end
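Shuffling is deterministic for a fixed random_state for the same reason as sample above; a minimal sketch:

handler.shuffle(random_state: 123) == handler.shuffle(random_state: 123)  # => true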
def statistics
def statistics
  {
    total_examples: @examples.size,
    example_types: @examples.map(&:class).uniq.map(&:name),
    memory_usage_estimate: @examples.size * 1000 # Rough estimate
  }
end
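A sketch of inspecting the returned summary hash for the plain-hash examples above; note that :memory_usage_estimate is only a rough per-example figure, not a measured value:

stats = handler.statistics
stats[:total_examples]         # => 3
stats[:example_types]          # => ["Hash"]
stats[:memory_usage_estimate]  # => 3000 (@examples.size * 1000)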
def stratified_sample(n, stratify_column: nil)
def stratified_sample(n, stratify_column: nil)
  # For now, fall back to regular sampling (can be enhanced later)
  sample(n)
end
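Since the current implementation ignores stratify_column and delegates to sample, the call behaves like plain random sampling; a minimal sketch:

# Currently equivalent to handler.sample(2); stratify_column is accepted
# but not yet used for stratification.
stratified = handler.stratified_sample(2, stratify_column: :question)
stratified.size  # => 2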