module Linguist::Samples

def self.cache

Returns the Hash of serialized samples data, loaded once and then cached in memory.
# Public: Memoized access to the serialized samples data.
#
# The first call delegates to load_samples and stores the result in @cache;
# later calls hand back the stored object.
#
# Returns whatever load_samples returned on the first call.
def self.cache
  @cache = load_samples unless @cache
  @cache
end

def self.data

Returns the Hash database built by training the Classifier on every sample.

Public: Build Classifier from all samples.
# Public: Build the samples database by walking every sample.
#
# For each sample yielded by `each`, records its extname, interpreter and
# filename under the sample's language, then feeds the file contents to
# Classifier.train!. After all samples, Classifier.finalize_train! is called
# and a digest of the database is stored under 'sha256'.
#
# Returns the database Hash.
def self.data
  db = { 'extnames' => {}, 'interpreters' => {}, 'filenames' => {} }

  each do |sample|
    language = sample[:language]

    extname = sample[:extname]
    if extname
      extnames = (db['extnames'][language] ||= [])
      # Keep each language's list unique and sorted as entries arrive.
      unless extnames.include?(extname)
        extnames.push(extname)
        extnames.sort!
      end
    end

    interpreter = sample[:interpreter]
    if interpreter
      interpreters = (db['interpreters'][language] ||= [])
      unless interpreters.include?(interpreter)
        interpreters.push(interpreter)
        interpreters.sort!
      end
    end

    filename = sample[:filename]
    if filename
      # Unlike extnames and interpreters, filenames are appended without a
      # duplicate check.
      filenames = (db['filenames'][language] ||= [])
      filenames.push(filename)
      filenames.sort!
    end

    Classifier.train!(db, language, File.read(sample[:path]))
  end

  Classifier.finalize_train!(db)
  db['sha256'] = Linguist::SHA256.hexdigest(db)
  db
end

def self.each(&block)

Returns nothing.

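&block - Block that is yielded a Hash describing each sample (:path and :language, plus either :filename or :interpreter and :extname).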

Public: Iterate over each sample.
# Public: Walk the samples tree under ROOT and yield one Hash per sample file.
#
# Each directory directly under ROOT is treated as a language. Inside it, a
# directory entry named 'filenames' is expanded one level and its entries are
# yielded with :path, :language and :filename. Every other entry is yielded
# with :path, :language, :interpreter (from Shebang.interpreter on the file
# contents) and :extname (nil when the name has no extension).
#
# Language directories are visited in sorted order; entries inside them are
# visited in the order Dir.entries reports them.
#
# Returns nothing.
def self.each(&block)
  skipped = ['.', '..']

  Dir.entries(ROOT).sort!.each do |language|
    next if skipped.include?(language)

    language_dir = File.join(ROOT, language)

    Dir.entries(language_dir).each do |entry|
      next if skipped.include?(entry)

      entry_path = File.join(language_dir, entry)

      unless entry == 'filenames'
        ext = File.extname(entry)
        yield({
          :path        => entry_path,
          :language    => language,
          :interpreter => Shebang.interpreter(File.read(entry_path)),
          :extname     => (ext.empty? ? nil : ext)
        })
        next
      end

      # 'filenames' holds samples that are identified by their full name.
      Dir.entries(entry_path).each do |special_name|
        next if skipped.include?(special_name)

        yield({
          :path     => File.join(entry_path, special_name),
          :language => language,
          :filename => special_name
        })
      end
    end
  end

  nil
end

def self.load_samples

Returns the Hash of serialized samples data, read from disk on every call (uncached).
# Internal: Load the serialized samples data from disk, bypassing the cache.
#
# Reads the file at PATH as UTF-8 and parses it with Yajl when that constant
# is defined, otherwise with JSON.
#
# Returns the parsed Hash, with the keys of every entry under 'centroids'
# converted to Integers.
def self.load_samples
  serializer = defined?(Yajl) ? Yajl : JSON
  data = serializer.load(File.read(PATH, encoding: 'utf-8'))

  # JSON serialization does not allow integer keys, so they come back as
  # strings; convert them back here. Only existing keys are reassigned, which
  # is safe while iterating the Hash.
  centroids = data['centroids']
  centroids.each do |lang, centroid|
    centroids[lang] = centroid.to_a.map { |key, value| [key.to_i, value] }.to_h
  end

  data
end