# lib/multiwoven/integrations/source/sftp/client.rb



# frozen_string_literal: true

module Multiwoven::Integrations::Source
  module Sftp
    include Multiwoven::Integrations::Core
    class Client < SourceConnector
      # Checks that the SFTP server is reachable and that the configured remote
      # file can be stat'ed.
      #
      # @param connection_config [#with_indifferent_access] connection settings;
      #   the keys actually read are consumed by create_connection.
      # @return [Object] the value built by success_status, or by failure_status
      #   when the check fails or any StandardError is raised (both helpers are
      #   defined outside this file).
      def check_connection(connection_config)
        connection_config = connection_config.with_indifferent_access
        create_connection(connection_config)
        # NOTE(review): per the Net::SFTP docs stat! raises on failure rather than
        # returning a falsy value, so the else branch is a defensive fallback.
        if @sftp.stat!(@remote_file_path)
          success_status
        else
          failure_status(nil)
        end
      rescue StandardError => e
        handle_exception(e, {
                           context: "SFTP:CHECK_CONNECTION:EXCEPTION",
                           type: "error"
                         })
        failure_status(e)
      ensure
        # create_connection allocates a tempfile and opens an SSH session on every
        # call; release both so repeated checks do not leak files or sockets.
        @tempfile&.close!
        ssh = @sftp&.session
        ssh.close if ssh && !ssh.closed?
      end

      # Downloads the remote file and derives a catalog from its column names.
      #
      # @param connection_config [#with_indifferent_access] connection settings;
      #   the keys actually read are consumed by create_connection.
      # @return [Object] the catalog converted with to_multiwoven_message, or
      #   whatever handle_exception returns when a StandardError is raised.
      def discover(connection_config)
        connection_config = connection_config.with_indifferent_access
        db = create_connection(connection_config)
        @sftp.download!(@remote_file_path, @tempfile.path)
        # The tempfile name is derived from the configured file name, so double any
        # single quote to keep it from terminating the SQL string literal early.
        escaped_path = @tempfile.path.gsub("'", "''")
        columns = db.query("SELECT * FROM read_csv_auto('#{escaped_path}')").columns
        Catalog.new(streams: create_streams(columns.map(&:name))).to_multiwoven_message
      rescue StandardError => e
        handle_exception(e, {
                           context: "SFTP:DISCOVER:EXCEPTION",
                           type: "error"
                         })
      ensure
        # Release the per-call tempfile and the SSH session opened by
        # create_connection; Net::SFTP.start without a block leaves closing the
        # session to the caller.
        @tempfile&.close!
        ssh = @sftp&.session
        ssh.close if ssh && !ssh.closed?
      end

      # Runs the sync's model query against the downloaded remote file.
      #
      # @param sync_config [Object] must expose source.connection_specification,
      #   model.query, limit, offset, sync_id and sync_run_id.
      # @return [Object] the value returned by #query, or whatever
      #   handle_exception returns when a StandardError is raised.
      def read(sync_config)
        connection_config = sync_config.source.connection_specification
        connection_config = connection_config.with_indifferent_access
        conn = create_connection(connection_config)
        query = sync_config.model.query
        # Paginate only when the sync supplies a limit or an offset.
        query = batched_query(query, sync_config.limit, sync_config.offset) unless sync_config.limit.nil? && sync_config.offset.nil?
        query(conn, query)
      rescue StandardError => e
        handle_exception(e, {
                           context: "SFTP:READ:EXCEPTION",
                           type: "error",
                           sync_id: sync_config.sync_id,
                           sync_run_id: sync_config.sync_run_id
                         })
      ensure
        # Every call creates a fresh tempfile and SSH session via
        # create_connection; release them here so batched reads do not pile up
        # temp files and open sockets.
        @tempfile&.close!
        ssh = @sftp&.session
        ssh.close if ssh && !ssh.closed?
      end

      private

      # Prepares everything a request needs: the remote path and tempfile, the
      # SFTP session (stored in @sftp) and a DuckDB connection.
      #
      # @param connection_config [Hash] settings forwarded to
      #   initialize_file_path and with_sftp_client.
      # @return [Object] the DuckDB connection, after INSTALL_HTTPFS_QUERY
      #   (defined outside this file) has been executed on it.
      def create_connection(connection_config)
        initialize_file_path(connection_config)
        @sftp = with_sftp_client(connection_config)
        DuckDB::Database.open.connect.tap do |duckdb|
          duckdb.execute(INSTALL_HTTPFS_QUERY)
        end
      end

      # Computes the remote file location and allocates a local tempfile for it.
      #
      # Sets @remote_file_path to "<file_path>/<file_name>.<format_type>" and
      # @tempfile to a new Tempfile whose name is prefixed with the remote
      # file's basename.
      #
      # @param connection_config [Hash] reads :file_path, :file_name and
      #   :format_type.
      # @return [Tempfile] the newly created tempfile.
      def initialize_file_path(connection_config)
        directory = connection_config[:file_path]
        remote_name = "#{connection_config[:file_name]}.#{connection_config[:format_type]}"
        @remote_file_path = File.join(directory, remote_name)
        @tempfile = Tempfile.new(File.basename(@remote_file_path))
      end

      # Opens an SFTP session with password authentication via Net::SFTP.start.
      #
      # @param connection_config [Hash] reads :host, :username, :password and
      #   :port (22 when the key is absent).
      # @param block [Proc, nil] optional block forwarded to Net::SFTP.start.
      # @return [Object] whatever Net::SFTP.start returns.
      def with_sftp_client(connection_config, &block)
        host = connection_config[:host]
        user = connection_config[:username]
        ssh_options = {
          password: connection_config[:password],
          port: connection_config.fetch(:port, 22)
        }
        Net::SFTP.start(host, user, **ssh_options, &block)
      end

      # Executes a query and converts the result rows into hashes.
      #
      # @param conn [#query] connection used to run the statement.
      # @param query [String] SQL to execute.
      # @return [Object] the value produced by hash_array_values for the result.
      def get_results(conn, query)
        hash_array_values(conn.query(query))
      end

      # Downloads the remote file, points the query at the local copy and returns
      # one record message per result row.
      #
      # Every "FROM <token>" in the query is replaced with a read_csv_auto call on
      # the downloaded tempfile; a query without a FROM clause is run unchanged.
      #
      # @param conn [#query] connection handed to get_results.
      # @param query [String] SQL whose FROM target is replaced.
      # @return [Array] one to_multiwoven_message result per row.
      def query(conn, query)
        @sftp.download!(@remote_file_path, @tempfile.path)
        # The tempfile name is derived from the configured file name, so double any
        # single quote to keep it from terminating the SQL string literal early.
        local_path = @tempfile.path.gsub("'", "''")
        # \b stops identifiers that merely end in "from" (e.g. valid_from) from
        # being rewritten; the block form keeps backslashes in the path from being
        # read as backreferences.
        query = query.gsub(/\bFROM\s+\S+/i) { "FROM read_csv_auto('#{local_path}')" }
        get_results(conn, query).map do |row|
          RecordMessage.new(data: row, emitted_at: Time.now.to_i).to_multiwoven_message
        end
      end

      # Converts a query result into an array of column-name => value hashes.
      #
      # @param describe [#columns, #map] result whose columns respond to #name
      #   and which yields one row of values per iteration.
      # @return [Array<Hash>] one hash per row, keyed by column name.
      def hash_array_values(describe)
        column_names = describe.columns.map(&:name)
        describe.map { |values| column_names.zip(values).to_h }
      end

      # Builds one protocol Stream per table entry produced by group_by_table.
      #
      # @param records [Array] column names handed to group_by_table.
      # @return [Array] streams with the "fetch" action and a JSON schema built
      #   by convert_to_json_schema from each entry's columns.
      def create_streams(records)
        group_by_table(records).values.map do |table|
          Multiwoven::Integrations::Protocol::Stream.new(
            name: table[:tablename],
            action: StreamAction["fetch"],
            json_schema: convert_to_json_schema(table[:columns])
          )
        end
      end

      # Groups the discovered column names under the single table this connector
      # exposes, which is named after the remote file path.
      #
      # Every column is described as an optional string. All columns are collected
      # into one entry so the file yields one table rather than one per column.
      #
      # @param records [Array<String>] column names.
      # @return [Hash] table name => { tablename:, columns: [...] }; empty when
      #   records is empty.
      def group_by_table(records)
        table_name = @remote_file_path
        records.each_with_object({}) do |column, tables|
          table = (tables[table_name] ||= { tablename: table_name, columns: [] })
          table[:columns] << { column_name: column, type: "string", optional: true }
        end
      end
    end
  end
end