lib/rouge/lexers/stata.rb



# -*- coding: utf-8 -*- #
# frozen_string_literal: true

module Rouge
  module Lexers
    class Stata < RegexLexer
      # Lexer metadata: the human-readable title and description, the tag for
      # this lexer, and the filename globs and MIME types declared for it.
      title "Stata"
      desc "The Stata programming language (www.stata.com)"
      tag 'stata'
      # Stata do-files (*.do) and ado-files (*.ado)
      filenames '*.do', '*.ado'
      mimetypes 'application/x-stata', 'text/x-stata'

      ###
      # Stata reference manual is available online at: https://www.stata.com/features/documentation/
      ###

      # Partial list of common programming and estimation commands, as of Stata 16.
      # Note: not all abbreviations are included.
      #
      # Every word appears exactly once, and the array is frozen so this shared
      # constant cannot be modified after the class has been loaded. The words
      # are joined into a single alternation by the keyword rule in the root
      # state, which only recognizes them as the first word on a line.
      KEYWORDS = %w(
        do run include clear assert set mata log
        by bys bysort cap capt capture char class classutil which cdir confirm new existence creturn
        _datasignature discard di dis disp displ displa display ereturn error _estimates exit file open read write seek close query findfile fvexpand
        gettoken java home heapmax java_heapmax icd9 icd9p icd10 icd10cm icd10pcs initialize javacall levelsof
        tempvar tempname tempfile macro shift uniq dups retokenize clean sizeof posof
        makecns matcproc marksample mark markout markin svymarkout matlist
        accum define dissimilarity eigenvalues get rowjoinbyname rownames score svd symeigen dir list ren rename
        more pause plugin call postfile _predict preserve restore program drop end python qui quietly noi noisily _return return _rmcoll rmsg _robust
        serset locale_functions locale_ui signestimationsample checkestimationsample sleep syntax sysdir adopath adosize
        tabdisp timer tokenize trace unab unabcmd varabbrev version viewsource
        window fopen fsave manage menu push stopbox
        net from cd link search install sj stb ado update uninstall pwd ssc ls
        using insheet outsheet mkmat svmat sum summ summarize
        graph gr_edit twoway histogram kdensity spikeplot
        mi miss missing var varname order compress append
        gen gene gener genera generat generate egen replace duplicates
        estimates nlcom lincom test testnl predict suest
        _regress reg regr regre regres regress probit logit ivregress logistic svy gmm ivprobit ivtobit
        bsample codebook collapse compare contract copy count cross datasignature d ds desc describe destring tostring
        drawnorm edit encode decode erase expand export filefilter fillin format frame frget frlink gsort
        import dbase delimited excel fred haver sas sasxport5 sasxport8 spss infile infix input insobs inspect ipolate isid
        joinby label language labelbook lookfor memory mem merge mkdir mvencode notes obs odbc outfile
        pctile xtile _pctile putmata range recast recode group reshape rm rmdir sample save saveold separate shell snapshot sort split splitsample stack statsby sysuse
        type unicode use varmanage vl webuse xpose zipfile
        number keep tab table tabulate stset stcox tsset xtset
      ).freeze

      # Built-in functions by name (intended to be complete as of Stata 16).
      #
      # Names are case-sensitive (e.g. `Clock` vs `clock`, `Cofc` vs `cofC`).
      # Every name appears exactly once, and the array is frozen so this shared
      # constant cannot be modified after the class has been loaded. The names
      # are joined into a single alternation by the function-call rule in the
      # root state, which only matches them when followed by `(`.
      PRIMITIVE_FUNCTIONS = %w(
        abbrev abs acos acosh age age_frac asin asinh atan atan2 atanh autocode
        betaden binomial binomialp binomialtail binormal birthday bofd byteorder
        c _caller cauchy cauchyden cauchytail Cdhms ceil char chi2 chi2den chi2tail Chms
        chop cholesky clip Clock clock clockdiff cloglog Cmdyhms Cofc cofC Cofd cofd coleqnumb
        collatorlocale collatorversion colnfreeparms colnumb colsof comb cond corr cos cosh
        daily date datediff datediff_frac day det dgammapda dgammapdada dgammapdadx dgammapdx dgammapdxdx dhms
        diag diag0cnt digamma dofb dofC dofc dofh dofm dofq dofw dofy dow doy dunnettprob e el epsdouble
        epsfloat exp expm1 exponential exponentialden exponentialtail
        F Fden fileexists fileread filereaderror filewrite float floor fmtwidth frval _frval Ftail
        gammaden gammap gammaptail get hadamard halfyear halfyearly has_eprop hh hhC hms hofd hours
        hypergeometric hypergeometricp
        I ibeta ibetatail igaussian igaussianden igaussiantail indexnot inlist inrange int inv invbinomial invbinomialtail
        invcauchy invcauchytail invchi2 invchi2tail invcloglog invdunnettprob invexponential invexponentialtail invF
        invFtail invgammap invgammaptail invibeta invibetatail invigaussian invigaussiantail invlaplace invlaplacetail
        invlogistic invlogistictail invlogit invnbinomial invnbinomialtail invnchi2 invnchi2tail invnF invnFtail invnibeta invnormal invnt invnttail
        invpoisson invpoissontail invsym invt invttail invtukeyprob invweibull invweibullph invweibullphtail invweibulltail irecode isleapyear issymmetric
        J laplace laplaceden laplacetail ln ln1m ln1p lncauchyden lnfactorial lngamma lnigammaden lnigaussianden lniwishartden lnlaplaceden lnmvnormalden
        lnnormal lnnormalden lnwishartden log log10 log1m log1p logistic logisticden logistictail logit
        matmissing matrix matuniform max maxbyte maxdouble maxfloat maxint maxlong mdy mdyhms mi min minbyte mindouble minfloat minint minlong minutes
        missing mm mmC mod mofd month monthly mreldif msofhours msofminutes msofseconds
        nbetaden nbinomial nbinomialp nbinomialtail nchi2 nchi2den nchi2tail nextbirthday nextleapyear nF nFden nFtail nibeta
        normal normalden npnchi2 npnF npnt nt ntden nttail nullmat
        plural poisson poissonp poissontail previousbirthday previousleapyear qofd quarter quarterly r rbeta rbinomial rcauchy rchi2 recode
        real regexm regexr regexs reldif replay return rexponential rgamma rhypergeometric rigaussian rlaplace rlogistic rnbinomial rnormal
        round roweqnumb rownfreeparms rownumb rowsof rpoisson rt runiform runiformint rweibull rweibullph
        s scalar seconds sign sin sinh smallestdouble soundex soundex_nara sqrt ss ssC strcat strdup string stritrim strlen strlower
        strltrim strmatch strofreal strpos strproper strreverse strrpos strrtrim strtoname strtrim strupper subinstr subinword substr sum sweep
        t tan tanh tC tc td tden th tin tm tobytes tq trace trigamma trunc ttail tukeyprob tw twithin
        uchar udstrlen udsubstr uisdigit uisletter uniform ustrcompare ustrcompareex ustrfix ustrfrom ustrinvalidcnt ustrleft ustrlen ustrlower
        ustrltrim ustrnormalize ustrpos ustrregexm ustrregexra ustrregexrf ustrregexs ustrreverse ustrright ustrrpos ustrrtrim ustrsortkey
        ustrsortkeyex ustrtitle ustrto ustrtohex ustrtoname ustrtrim ustrunescape ustrupper ustrword ustrwordcount usubinstr usubstr
        vec vecdiag week weekly weibull weibullden weibullph weibullphden weibullphtail weibulltail wofd word wordbreaklocale wordcount
        year yearly yh ym yofd yq yw
      ).freeze

      # Words highlighted as types, returned as a Set that is built on the
      # first call and reused on every later call.
      # Note: the sized string types `str1`-`str2045` are not listed here; they
      # are matched by a dedicated rule in the root state.
      def self.type_keywords
        @type_keywords ||= Set.new(
          %w(
            byte int long float double
            str strL
            numeric string integer scalar matrix
            local global
            numlist varlist newlist
          )
        )
      end

      # Stata commands used with braces, returned as a Set that is built on the
      # first call and reused on every later call.
      # Includes all valid abbreviations for 'forvalues'.
      def self.reserved_keywords
        @reserved_keywords ||= Set.new(
          %w(
            if else
            foreach
            forv forva forval forvalu forvalue forvalues
            to while in of
            continue break nobreak
          )
        )
      end

      ###
      # Lexer state and rules
      ###
      # The single lexer state. Rule order is significant: the line-anchored
      # comment and keyword rules come first, specific constructs (strings,
      # macros, formats, types, functions, numbers) follow, and the generic
      # word / punctuation / operator rules act as fallbacks at the end.
      state :root do

        # Pre-processor commands: #
        rule %r/^\s*#.*$/, Comment::Preproc

        # Hashbang comments: *!
        rule %r/^\*!.*$/, Comment::Hashbang

        # Single-line comment: *
        rule %r/^\s*\*.*$/, Comment::Single

        # Keywords: recognize only when they are the first word
        # (the match includes any leading whitespace on the line)
        rule %r/^\s*(#{KEYWORDS.join('|')})\b/, Keyword

        # Whitespace. Classify `\n` as `Text` to avoid interference with `Comment` and `Keyword` above
        rule(/[ \t]+/, Text::Whitespace)
        rule(/[\n\r]+/, Text)

        # In-line comment: //
        rule %r/\/\/.*?$/, Comment::Single

        # Multi-line comment: /* and */
        # The `m` flag lets `.` cross line breaks; the optional `\\\n` groups
        # allow a backslash-newline between the `/` and `*` of either delimiter.
        rule %r(/(\\\n)?[*].*?[*](\\\n)?/)m, Comment::Multiline

        # Strings indicated by compound double-quotes (`""') and double-quotes ("")
        rule %r/`"(\\.|.)*?"'/, Str::Double
        rule %r/"(\\.|.)*?"/, Str::Double

        # Format locals (`') and globals as strings.
        # Globals are accepted both bare ($name) and braced (${name}); the
        # braced alternative is needed because `$` is not covered by the
        # punctuation or operator rules at the end of this state.
        # The lookbehind keeps a `$` that directly follows a word character
        # from starting a global.
        rule %r/`(\\.|.)*?'/, Str::Double
        rule %r/(?<!\w)\$(?:\w+|\{\w+\})/, Str::Double

        # Display formats: `%` followed by a run of non-whitespace characters
        rule %r/\%\S+/, Name::Property

        # Additional string types: str1-str2045
        rule %r/\bstr(204[0-5]|20[0-3][0-9]|[01][0-9][0-9][0-9]|[0-9][0-9][0-9]|[0-9][0-9]|[1-9])\b/, Keyword::Type

        # Only recognize primitive functions when they are actually used as a function call, i.e. followed by an opening parenthesis
        # `Name::Builtin` would be more logical, but is not usually highlighted, so use `Name::Function` instead
        rule %r/\b(#{PRIMITIVE_FUNCTIONS.join('|')})(?=\()/, Name::Function

        # Matrix operator `..` (declare here instead of with other operators, in order to avoid conflict with numbers below)
        # Only matched when a closing `]` appears later on the same line.
        rule %r/\.\.(?=.*\])/, Operator

        # Numbers: optional sign, integer or decimal part, optional exponent
        rule %r/[+-]?(\d+([.]\d+)?|[.]\d+)([eE][+-]?\d+)?/, Num

        # Factor variable and time series operators
        # First form: a word starting with one of I/C/O/L/F/D/S (either case)
        # and ending in a dot. Second form: the same word followed by a
        # parenthesized group and then a dot (only the word itself is consumed).
        rule %r/\b[ICOicoLFDSlfds]\w*\./, Operator
        rule %r/\b[ICOicoLFDSlfds]\w*(?=\(.*\)\.)/, Operator

        # Any remaining word: reserved keyword, type keyword, or plain name
        rule %r/\w+/ do |m|
          if self.class.reserved_keywords.include? m[0]
            token Keyword::Reserved
          elsif self.class.type_keywords.include? m[0]
            token Keyword::Type
          else
            token Name
          end
        end

        rule %r/[\[\]{}();,]/, Punctuation

        rule %r([-<>?*+'^/\\!#.=~:&|]), Operator
      end
    end
  end
end