module SportDb
class MatchParser ## simple match parser for team match schedules
## Class-wide debug switch (kept in the @@debug class variable).
def self.debug=(value)
  @@debug = value
end

## note: default is FALSE (a missing/nil/false value gets normalized to false)
def self.debug?
  @@debug ||= false
end

## Instance-level shortcut - delegates to the class-wide switch.
def debug?
  self.class.debug?
end
include Logging ## e.g. logger#debug, logger#info, etc.
## Convenience helper - builds a new parser for lines and
## returns the result of its (instance) parse method.
##
## todo/fix: add support for txt and lines
##   check if lines_or_txt is an array or just a string
## todo/check: use teams: like start: - why? why not?
def self.parse( lines, start: )
  new( lines, start ).parse
end
## Preprocess text - returns an array of (stripped) lines
## with empty lines, comment lines and end-of-line comments removed.
##
## todo/check: add alias preproc_lines or build_lines or prep_lines etc. - why? why not?
def _read_lines( txt )
  txt.each_line
     .map    { |raw|  raw.strip }
     .reject { |line| line.empty? || line.start_with?('#') }   ### skip empty lines and comments
     .map    { |line| line.sub( /#.*/, '' ).strip }            ### cut-off end-of line comments too
end
## Regex for (experimental) attribute lines in the form "key: value".
##
## note: the colon (:) MUST be followed by one (or more) spaces
##   - makes sure a time such as in "mon feb 12 18:10" will NOT match
##   - allows keys such as "1. FC Köln" or "Mainz 05", e.g.
##        Mainz 05: <value>
##   - the key is limited to 1+30 chars max
##   - the key only allows chars incl. intl but NOT :|()[]/;
##        (and the first char may NOT be a space or dash either)
##
## todo/check: group lines with a colon, e.g.
##     Group A:
##     Group B:   - remove colon
##   or lookup (group) first - why? why not?
ATTRIB_RE = %r{^
[ ]*? # slurp leading spaces
(?<key>[^:|\]\[()\/; -]
[^:|\]\[()\/;]{0,30}
)
[ ]*? # slurp trailing spaces
:[ ]+
(?<value>.+)
[ ]*? # slurp trailing spaces
$
}ix
#
# todo/fix: change start to start: too!!!
# might be optional in the future!! - why? why not?
## Set up a new parser.
##   lines - text (string) or an array of lines;
##           for convenience a string gets split into lines
##           (via _read_lines)
##           note: empty lines get removed/stripped (for strings only)
##   start - start date passed along for building dates
##
## todo/check: change to text instead of array of lines - why? why not?
def initialize( lines, start )
  @lines = case lines
           when String then _read_lines( lines )
           else             lines
           end
  @start  = start
  @errors = []
end
## list of errors collected; gets reset to empty on every (new) parse call
attr_reader :errors
## Returns true if any errors got collected.
def errors?
  !@errors.empty?
end
## Parse all lines; returns a four-element array
##    [team_names, matches, rounds, groups]
##
## note - team_names are the keys (names) only of the "internal" team stats
##          and NOT team/club/nat_team structs!!
##
## Works in two passes:
##   1) tokenize line-by-line (via Parser#parse_with_errors) into @tree;
##      tokenize errors get collected in @errors
##   2) walk the @tree and dispatch on the type of the first/head node
def parse
  ## note: every (new) parse call resets the errors list (and all other state)
  @errors     = []
  @last_date  = nil
  @last_time  = nil
  @last_round = nil
  @last_group = nil
  ## last_goals - rename to (longer) @last_team_goals or such - why? why not?
  @last_goals = 1     ## toggle between 1|2 - hacky (quick & dirty) support for multi-line goals, fix soon!

  @teams   = Hash.new(0)   ## track counts (only) for now for (internal) team stats - why? why not?
  @rounds  = {}
  @groups  = {}
  @matches = []
  @warns   = []            ## track list of warnings (unmatched lines) too - why? why not?

  @parser = Parser.new
  @tree   = []

  ## pass 1 - tokenize
  in_attrib = false
  @lines.each do |line|
    if debug?
      puts
      puts "line >#{line}<"
    end

    ## skip new (experimental) attrib syntax
    if in_attrib
      ## continuation line - attrib (value) runs up to the line ending with a dot (.)
      in_attrib = false   if line.end_with?( '.' )
      next
    end

    ## note: check attrib regex AFTER group def e.g.:
    ##         Group A:
    ##         Group B:  etc.
    ##       todo/fix - change Group A: to Group A etc.
    if ATTRIB_RE.match?( line )
      ## fix: only slurp up following lines if the attrib line
      ##        itself does NOT end with a dot (.) already;
      ##        otherwise the next (regular) line gets lost
      in_attrib = !line.end_with?( '.' )
      next
    end

    t, error_messages = @parser.parse_with_errors( line )

    ## add to "global" error list
    ##   make a triplet tuple (file / msg / line text)
    error_messages.each do |msg|
      @errors << [ '<file>',    ## add filename here
                   msg,
                   line
                 ]
    end

    pp t   if debug?
    @tree << t
  end

  ## pass 2 - walk (node) tree
  ##   report parse errors here - why? why not?
  @tree.each do |nodes|
    ## fix: skip lines with no nodes (otherwise nodes[0][0] fails on nil)
    next if nodes.nil? || nodes.empty?

    case nodes[0][0]       ## get node type of first/head node
    when :round_def
      ## todo/fix: add round definition (w begin n end date)
      ## todo: do not patch rounds with definition (already assume begin/end date is good)
      ##   -- how to deal with matches that get rescheduled/postponed?
      parse_round_def( nodes )
    when :group_def        ## NB: group goes after round (round may contain group marker too)
      ### todo: add pipe (|) marker (required)
      parse_group_def( nodes )
    when :player, :none    # e.g [[:none], [:";"], [:player, "Xhaka"],...]
      ## note - for now goals line MUST start with player!!
      parse_goals( nodes )
    else
      ## try to be liberal/flexible
      ##   eat-up nodes as we go
      ##   assume match with group / round header etc.
      ##     on its own line or not
      until nodes.empty?
        case nodes[0][0]
        when :round
          parse_round_header( nodes.shift )
        when :leg
          ## ignore (round) leg for now - add later leg - 1|2|3 etc!!!
          ##   needs to get added to db/schema too!!!!
          nodes.shift
        when :group
          ## -- lets you set group e.g. Group A etc.
          parse_group_header( nodes.shift )
        when :date
          parse_date_header( nodes.shift )
        else
          break      ## no more (leading) header nodes
        end
      end
      next if nodes.empty?     ## header-only line

      ## rename to try_parse_match - why? why not?
      parse_match( nodes )
    end
  end

  [@teams.keys, @matches, @rounds.values, @groups.values]
end # method parse
## Handle a group header node (e.g. Group A);
##   sets the group for the matches that follow.
##
## note: a group header resets the (last) round (allows, for example):
##   Group Playoffs/Replays   -- round header
##     team1 team2            -- match
##   Group B                  -- group header
##     team1 team2            -- match (will get new auto-matchday! not last round)
##
## Prints a parse error and exits if no group def is found for the name.
def parse_group_header( node )
  logger.debug "parsing group header: >#{node}<"

  @last_round = nil

  name  = node[1]
  found = @groups[ name ]
  if found.nil?
    puts "!! PARSE ERROR - no group def found for >#{name}<"
    exit 1
  end

  @last_group = found    # set group for games
end
## Handle a group definition line, e.g.
##     [:group_def, "Group A"],
##     [:team, "Germany"],
##     [:team, "Scotland"],
##     [:team, "Hungary"],
##     [:team, "Switzerland"]
##
## Counts every team (in @teams) and registers the group by name (in @groups).
## Prints a parse error and exits if any node after the head is not a team.
def parse_group_def( nodes )
  logger.debug "parsing group def: >#{nodes}<"

  head, *members = nodes
  name = head[1]      ## group name

  teams = members.map do |member|
    unless member[0] == :team
      puts "!! PARSE ERROR - only teams expected in group def; got:"
      pp nodes
      exit 1
    end

    team_name = member[1]
    @teams[ team_name ] += 1
    team_name
  end

  ## todo/check/fix: add back group key - why? why not?
  @groups[ name ] = Import::Group.new( name:  name,
                                       teams: teams )
end
## Build a date from month (m), day (d) and an optional year (y).
##
## If the year is missing (nil) it gets calculated using the start date:
##   - month/day on or after start month/day  => start.year
##        (e.g. 2013 for 2013/14 season)
##   - month/day before start month/day       => start.year+1
##        (e.g. 2014 for 2013/14 season)
##
## Returns a Date; note - Date.new raises for invalid dates
##   (e.g. feb/29 in a non-leap year).
def _build_date( m:, d:, y:, start: )
  ## quick debug hack for feb/29 dates
  ##  fix: only print if debug is turned on (was printed always)
  if debug? && m == 2 && d == 29
    puts "quick check feb/29 dates"
    pp [d,m,y]
    pp start
  end

  if y.nil?    ## try to calculate year
    on_or_after_start = m > start.month ||
                        (m == start.month && d >= start.day)
    y = on_or_after_start ? start.year : start.year+1
  end

  Date.new( y,m,d )
end
## Handle a round definition line, e.g.
##   [[:round_def, "Matchday 1"], [:duration, "Fri Jun/14 - Tue Jun/18"]]
##   [[:round_def, "Matchday 2"], [:duration, "Wed Jun/19 - Sat Jun/22"]]
##
## The second node MUST be a date (start date = end date) or a duration;
##   otherwise prints a parse error and exits.
## Registers the (non-auto) round by name in @rounds.
def parse_round_def( nodes )
  logger.debug "parsing round def: >#{nodes}<"

  name = nodes[0][1]
  # NB: use extracted round name for knockout check
  # knockout_flag = is_knockout_round?( name )

  ## note - _build_date always returns Date (date only, NOT datetime)
  ##          no to_date conversion needed
  to_date = ->(parts) { _build_date( m: parts[:m],
                                     d: parts[:d],
                                     y: parts[:y],
                                     start: @start ) }

  span = nodes[1]
  case span[0]
  when :date
    start_date = end_date = to_date.call( span[2] )
  when :duration
    start_date = to_date.call( span[2][:start] )
    end_date   = to_date.call( span[2][:end] )
  else
    puts "!! PARSE ERROR - expected date or duration for round def; got:"
    pp nodes
    exit 1
  end

  ## fix:
  ##  remove knockout_flag - why? why not?
  knockout_flag = false

  logger.debug " start_date: #{start_date}"
  logger.debug " end_date: #{end_date}"
  logger.debug " name: >#{name}<"
  logger.debug " knockout_flag: #{knockout_flag}"

  @rounds[ name ] = Import::Round.new( name:       name,
                                       start_date: start_date,
                                       end_date:   end_date,
                                       knockout:   knockout_flag,
                                       auto:       false )
end
## Handle a round header node; sets the round for the matches that follow.
##   Rounds missing in @rounds get auto-added / created (by name).
##
## note: resets the (last) group to no group - why? why not?
def parse_round_header( node )
  logger.debug "parsing round header: >#{node}<"

  name = node[1]
  # name = name.sub( ROUND_EXTRA_WORDS_RE, '' )
  # name = name.strip

  if @rounds[ name ].nil?    ## auto-add / create if missing
    ## todo/check: add num (was pos) if present - why? why not?
    @rounds[ name ] = Import::Round.new( name: name )
  end

  ## todo/check: if pos match (MUST always match for now)
  @last_round = @rounds[ name ]

  ## todo/fix/check
  ##  make round a scope for date(time) - why? why not?
  ##   reset date/time e.g. @last_date = nil !!!!
  @last_group = nil
end
## Handle a date header node; remembers the date (in @last_date)
##   for the matches that follow and resets the (last) time.
def parse_date_header( node )
  logger.debug( "date header: >#{node}<")

  parts = node[2]
  date  = _build_date( m: parts[:m],
                       d: parts[:d],
                       y: parts[:y],
                       start: @start )

  logger.debug( " date: #{date} with start: #{@start}")

  @last_date = date    # keep a reference for later use
  @last_time = nil

  ### quick "corona" hack (turned off for now) - support seasons going beyond 12 month
  ##     (see swiss league 2019/20 and others!!)
  ##   find a better way??
  ##   set @start date to full year (e.g. 1.1.) if date.year is @start.year+1
  ##   todo/fix: add to linter to check for chronological dates!! - warn if NOT chronological
  ###  todo/check: just turn on for 2019/20 season or always? why? why not?
  ##   todo/fix: add switch back to old @start_org
  ##     if year is date.year == @start.year-1 -- possible when full date with year set!!!
  ##
  ##  if @start.month != 1
  ##    if date.year == @start.year+1
  ##      logger.debug( "!! hack - extending start date to full (next/end) year; assumes all dates are chronological - always moving forward" )
  ##      @start_org = @start   ## keep a copy of the original (old) start date - why? why not? - not used for now
  ##      @start = Date.new( @start.year+1, 1, 1 )
  ##    end
  ##  end
  nil
end
## Parse the goals of one player, e.g.
##    Kane 39', 62', 67'
##
## Eats-up the (leading) player node plus all minute nodes that follow
##   (incl. optional own goal / penalty markers and comma separators).
##
## Returns an array of goal hashes (one per minute) with the keys
##    :name, :minute and (if present) :offset, :og, :pen
##
## Prints a parse error and exits if no minute follows the player.
##
## fix: no longer fails (on nil) if the nodes run out, that is,
##        after a trailing comma or for a player without any minute
def parse_minutes( nodes )
  goals = []

  name = nodes.shift[1]    ## get player
  ## type of next node (or nil if no more nodes)
  peek = -> { nodes.empty? ? nil : nodes[0][0] }

  loop do
    node_type = peek.call
    if node_type != :minute
      puts "!! PARSE ERROR - minute expected to follow player (in goal); got #{node_type}:"
      pp nodes
      exit 1
    end

    minute = nodes.shift
    goal = { name:   name,
             minute: minute[2][:m] }
    goal[:offset] = minute[2][:offset]   if minute[2][:offset]

    ## check for own goal or penalty or such
    case peek.call
    when :og
      nodes.shift
      goal[:og] = true
    when :pen
      nodes.shift
      goal[:pen] = true
    end

    goals << goal

    ## consume/eat-up (optional?) commas
    nodes.shift   if peek.call == :','

    ## check if another minute ahead; otherwise break
    break if peek.call != :minute
  end

  goals
end
## Parse a goals line and add/append the goals to the last match.
##
##   - players (plus minutes) get parsed via parse_minutes
##   - the team separator (;) switches from team 1 to team 2
##   - @last_goals (1|2) keeps track of the team across lines -
##       hacky (quick & dirty) multi-line support for goals
##
## Prints a parse error and exits on unexpected node types.
##
## fix: a goals line without any (preceding) match no longer fails
##        (on nil); an error gets added to @errors instead
def parse_goals( nodes )
  logger.debug "parse goals: >#{nodes}<"

  goals_by_team = { 1 => [], 2 => [] }

  until nodes.empty?
    node_type = nodes[0][0]
    case node_type
    when :player
      team = @last_goals == 2 ? 2 : 1
      goals_by_team[ team ].concat( parse_minutes( nodes ) )
    when :';'      ## team separator
      nodes.shift  # eat-up
      @last_goals = 2
    when :none
      nodes.shift  # eat-up
    else
      puts "!! PARSE ERROR - unexpected node type in goals;; got #{node_type}:"
      pp nodes
      exit 1
    end
  end

  pp [goals_by_team[1], goals_by_team[2]]   if debug?

  ## wrap in (flat) goal structs - one entry per goal;
  ##   team 1 goals first, followed by team 2 goals
  goals = goals_by_team.flat_map do |team, recs|
    recs.map do |rec|
      Import::Goal.new(
        player:  rec[:name],
        team:    team,
        minute:  rec[:minute],
        offset:  rec[:offset],
        penalty: rec[:pen] || false,   # note: pass along/use false NOT nil
        owngoal: rec[:og]  || false
      )
    end
  end

  pp goals   if debug?

  ## quick & dirty - auto add goals to last match
  ##   note - for hacky (quick & dirty) multi-line support
  ##            always append for now
  match = @matches[-1]
  if match.nil?
    msg = "PARSE ERROR - goals line found but no (preceding) match to add goals to"
    puts "!! #{msg}"
    @errors << [msg]
    return
  end

  match.goals ||= []
  match.goals += goals

  ## todo/fix
  ##   sort by minute
  ##   PLUS auto-fill score1,score2 - why? why not?
end
## Parse a match line and add the match to @matches.
##
## Collects the (possible) nodes by type:
##    num, date, time, team (x2), score, status;
##    vs, @, comma, geo and timezone nodes get skipped (for now)
##
## A missing date / time falls back to the last seen date / time.
## The round is the last round header or, if missing,
##    the (first) round def matching the date - if round defs exist.
##
## Unexpected node types or not exactly two teams get added to @errors
##    (and the match is skipped); no matching round prints
##    a parse error and exits.
##
## fix: a match with no date at all (and round defs present)
##        no longer fails (on nil) - gets reported as no matching round found
def parse_match( nodes )
  logger.debug( "parse match: >#{nodes}<" )

  num    = nil
  date   = nil
  time   = nil
  score  = nil
  status = nil
  teams  = []

  until nodes.empty?
    node = nodes.shift
    node_type = node[0]

    case node_type
    when :num
      num = node[1]
    when :date
      ## note: date wipes out/clear time
      ##         time MUST always come after date
      time = nil
      date = _build_date( m: node[2][:m],
                          d: node[2][:d],
                          y: node[2][:y],
                          start: @start )
    when :time
      ## note - there's no time (-only) type in ruby
      ##   use string (e.g. '14:56', '1:44')
      ##   use 01:44 or 1:44 ?
      ##   check for 0:00 or 24:00 possible?
      time = '%d:%02d' % [node[2][:h], node[2][:m]]
    when :team
      teams << node[1]
    when :score
      ### todo/fix
      ##   add keywords (e.g. ht, ft or such) to Score.new - why? why not?
      ##     or use new Score.build( ht:, ft:, ) or such - why? why not?
      ht  = node[2][:ht] || [nil,nil]
      ft  = node[2][:ft] || [nil,nil]
      et  = node[2][:et] || [nil,nil]
      pen = node[2][:p]  || [nil,nil]
      score = Score.new( *ht, *ft, *et, *pen )
    when :status    # e.g. awarded, canceled, postponed, etc.
      status = node[1]
    when :vs, :'@', :',', :geo, :timezone
      ## skip; do nothing (for now), e.g.
      ##   [:"@"], [:geo, "Stade de France"], [:","], [:geo, "Saint-Denis"]]
      ##   [:"@"], [:geo, "Arena de São Paulo"], [:","], [:geo, "São Paulo"], [:timezone, "(UTC-3)"]
      ## todo/check: allow match status also in geo part (e.g. after @) - why? why not?
    else
      puts "!! PARSE ERROR - unexpected node type #{node_type} in match line; got:"
      pp node
      @errors << ["PARSE ERROR - unexpected node type #{node_type} in match line; got: #{node.inspect}"]
      return
    end
  end

  if teams.size != 2
    puts "!! PARSE ERROR - expected two teams; got #{teams.size}:"
    pp teams
    @errors << ["PARSE ERROR - expected two teams; got #{teams.size}: #{teams.inspect}"]
    return
  end

  team1, team2 = teams
  @teams[ team1 ] += 1
  @teams[ team2 ] += 1

  ###
  # check if date found?
  #   note: ruby falsey is nil & false only (not 0 or empty array etc.)
  if date
    @last_date = date    # keep a reference for later use
    @last_time = nil
  else
    date = @last_date    # no date found; (re)use last seen date
  end

  if time
    @last_time = time
  else
    time = @last_time
  end

  round = @last_round || nil
  ## find (first) matching round by date if rounds / matchdays defined
  ##   if no rounds / matchdays defined - YES, allow matches WITHOUT rounds!!!
  if round.nil? && @rounds.size > 0
    ## note: convert date to date only (no time) with to_date!!!
    ##       (date might be nil if no date seen yet - matches no round)
    round = @rounds.values.find do |round_rec|
      date &&
      round_rec.start_date && round_rec.end_date &&
      date.to_date >= round_rec.start_date &&
      date.to_date <= round_rec.end_date
    end

    if round.nil?
      puts "!! PARSE ERROR - no matching round found for match date:"
      pp date
      exit 1
    end
  end

  ## todo/check: scores are integers or strings?
  ## todo/check: pass along round and group refs or just string (canonical names) - why? why not?
  date_str = date ? date.strftime('%Y-%m-%d') : nil
  time_str = (date && time) ? time : nil

  ground = nil

  @matches << Import::Match.new( num:    num,
                                 date:   date_str,
                                 time:   time_str,
                                 team1:  team1,    ## note: for now always use mapping value e.g. rec (NOT string e.g. team1.name)
                                 team2:  team2,    ## note: for now always use mapping value e.g. rec (NOT string e.g. team2.name)
                                 score:  score,
                                 round:  round       ? round.name       : nil,   ## note: for now always use string (assume unique canonical name for event)
                                 group:  @last_group ? @last_group.name : nil,   ## note: for now always use string (assume unique canonical name for event)
                                 status: status,
                                 ground: ground )
  ### todo: cache team lookups in hash?

  ## hacky goals support
  ###  reset/toggle 1/2
  @last_goals = 1
end
end # class MatchParser
end # module SportDb