#!/usr/local/bin/perl -w
######################################################################
#
# $Id: ftimes-proximo,v 1.10 2014/07/18 06:40:45 mavrik Exp $
#
######################################################################
#
# Copyright 2009-2014 The FTimes Project, All Rights Reserved.
#
######################################################################
#
# Purpose: Locate a group of dig hits within a specified byte range.
#
######################################################################

use strict;
use File::Basename;
use FindBin qw($Bin $RealBin); use lib ("$Bin/../lib/perl5/site_perl", "$RealBin/../lib/perl5/site_perl", "/usr/local/ftimes/lib/perl5/site_perl");
use FTimes::EadRoutines;
use Getopt::Std;
use Storable qw(dclone);

BEGIN
{
  ####################################################################
  #
  # The Properties hash is essentially private. Those parts of the
  # program that wish to access or modify the data in this hash need
  # to call GetProperties() to obtain a reference.
  #
  ####################################################################

  my (%hProperties);

  sub GetProperties
  {
    return \%hProperties;
  }
}

######################################################################
#
# Main Routine
#
######################################################################

  ####################################################################
  #
  # Punch in and go to work.
  #
  ####################################################################

  my ($phProperties);

  $phProperties = GetProperties();

  $$phProperties{'Program'} = basename(__FILE__);

  ####################################################################
  #
  # FTimes fields.
  #
  ####################################################################

  my %hFTimesFields =
  (
    'footprint'   => 0,
    'gap'         => 0,
    'group'       => 0,
    'hostname'    => 0,
    'joiner'      => 0,
    'limit'       => 0,
    'name'        => 0,
    'offset'      => 0,
    'offsets'     => 0,
    'ordered'     => 0,
    'proximity'   => 0,
    'range'       => 0,
    'string'      => 0,
    'tag'         => 0,
    'tags'        => 0,
    'type'        => 0,
    'window'      => 0,
  );

  $$phProperties{'FTimesFields'} = \%hFTimesFields;

  ####################################################################
  #
  # Get Options.
  #
  ####################################################################

  my (%hOptions);

  if (!getopts('f:G:g:l:r:', \%hOptions))
  {
    Usage($$phProperties{'Program'});
  }

  ####################################################################
  #
  # An input file, '-f', is required. It can be '-' or a regular file.
  #
  ####################################################################

  my ($sFileHandle);

  if (!exists($hOptions{'f'}))
  {
    Usage($$phProperties{'Program'});
  }
  else
  {
    my $sFile = $hOptions{'f'};
    if ($sFile eq '-')
    {
      $sFileHandle = \*STDIN;
    }
    else
    {
      if (!open(FH, "< $sFile"))
      {
        print STDERR "$$phProperties{'Program'}: Error='Unable to open $sFile ($!).'\n";
        exit(2);
      }
      $sFileHandle = \*FH;
    }
  }

  ####################################################################
  #
  # A group, '-G', is conditionally required.
  #
  ####################################################################

  $$phProperties{'Group'} = (exists $hOptions{'G'}) ? $hOptions{'G'} : undef;

  ####################################################################
  #
  # A groups file, '-g', is conditionally required.
  #
  ####################################################################

  $$phProperties{'GroupsFile'} = (exists $hOptions{'g'}) ? $hOptions{'g'} : undef;

  ####################################################################
  #
  # A gap limit, '-l', is optional.
  #
  ####################################################################

  $$phProperties{'GapLimit'} = (exists $hOptions{'l'}) ? $hOptions{'l'} : 100;

  if ($$phProperties{'GapLimit'} !~ /^\d+$/ || $$phProperties{'GapLimit'} < 1)
  {
    print STDERR "$$phProperties{'Program'}: Error='Invalid gap limit ($$phProperties{'GapLimit'}). Use a decimal number greater than one.'\n";
    exit(2);
  }

  ####################################################################
  #
  # A range window, '-r', is optional.
  #
  ####################################################################

  $$phProperties{'RangeWindow'} = (exists $hOptions{'r'}) ? $hOptions{'r'} : 100;

  if ($$phProperties{'RangeWindow'} !~ /^(?:\d+|infinity)$/)
  {
    print STDERR "$$phProperties{'Program'}: Error='Invalid range ($$phProperties{'RangeWindow'}). Use a decimal number or \"infinity\".'\n";
    exit(2);
  }

  ####################################################################
  #
  # If any arguments remain, it's an error.
  #
  ####################################################################

  if (scalar(@ARGV) > 0)
  {
    Usage($$phProperties{'Program'});
  }

  ####################################################################
  #
  # If a group was specified, add it to the master hash.
  #
  ####################################################################

  my (%hMasterGroups, $phFileGroups, $sError);

  if (defined($$phProperties{'Group'}))
  {
    if (!AddGroup($$phProperties{'Group'}, $$phProperties{'RangeWindow'}, \%hMasterGroups, \$sError))
    {
      print STDERR "$$phProperties{'Program'}: Error='$sError'\n";
      exit(2);
    }
  }

  ####################################################################
  #
  # If a groups file was specified, read it, and add any groups.
  #
  ####################################################################

  if (defined($$phProperties{'GroupsFile'}))
  {
    if (!open(GH, "< $$phProperties{'GroupsFile'}"))
    {
      print STDERR "$$phProperties{'Program'}: Error='Unable to open groups file ($!).'\n";
      exit(2);
    }
    while (my $sLine = <GH>)
    {
      $sLine =~ s/[\r\n]+$//;
      next if ($sLine =~ /^#/ || $sLine =~ /^\s*$/);
      if (!AddGroup($sLine, $$phProperties{'RangeWindow'}, \%hMasterGroups, \$sError))
      {
        print STDERR "$$phProperties{'Program'}: Error='$sError'\n";
        exit(2);
      }
    }
    close(GH);
  }

  ####################################################################
  #
  # It's an error if the master hash is empty.
  #
  ####################################################################

  if (scalar(keys(%hMasterGroups)) < 1)
  {
    print STDERR "$$phProperties{'Program'}: Error='Need at least one group to proceed.'\n";
    exit(2);
  }

  ##################################################################
  #
  # Process the header.
  #
  ##################################################################

  my (@aHeaderFields, $sHeader, $sHeaderFieldCount);

  if (!defined($sHeader = <$sFileHandle>))
  {
    print STDERR "$$phProperties{'Program'}: Error='Header not defined.'\n";
    exit(2);
  }
  $sHeader =~ s/[\r\n]+$//;
  @aHeaderFields = split(/\|/, $sHeader);
  $sHeaderFieldCount = scalar(@aHeaderFields);
  for (my $sIndex = 0; $sIndex < $sHeaderFieldCount; $sIndex++)
  {
    if (!exists($hFTimesFields{$aHeaderFields[$sIndex]}))
    {
      print STDERR "$$phProperties{'Program'}: Error='Field ($aHeaderFields[$sIndex]) not recognized.'\n";
      exit(2);
    }
    if ($aHeaderFields[$sIndex] =~ /^footprint$/i)
    {
      $$phProperties{'FootprintIndex'} = $sIndex;
    }
    elsif ($aHeaderFields[$sIndex] =~ /^hostname$/i)
    {
      $$phProperties{'HostIndex'} = $sIndex;
    }
    elsif ($aHeaderFields[$sIndex] =~ /^name$/i)
    {
      $$phProperties{'NameIndex'} = $sIndex;
    }
    elsif ($aHeaderFields[$sIndex] =~ /^offset$/i)
    {
      $$phProperties{'OffsetIndex'} = $sIndex;
    }
    elsif ($aHeaderFields[$sIndex] =~ /^string$/i)
    {
      $$phProperties{'StringIndex'} = $sIndex;
    }
    elsif ($aHeaderFields[$sIndex] =~ /^tag$/i || $aHeaderFields[$sIndex] =~ /^group$/i)
    {
      $$phProperties{'TagIndex'} = $sIndex;
    }
  }
  if
  (
    (!defined($$phProperties{'FootprintIndex'}) && !defined($$phProperties{'StringIndex'})) ||
    !defined($$phProperties{'NameIndex'}) ||
    !defined($$phProperties{'OffsetIndex'}) ||
    !defined($$phProperties{'TagIndex'})
  )
  {
    print STDERR "$$phProperties{'Program'}: Header='$sHeader' Error='Invalid header or unable to locate the required fields.'\n";
    exit(2);
  }

  ##################################################################
  #
  # Write the output header.
  #
  ##################################################################

  my @aOutputHeaderFields = qw(name group ordered proximity gap limit range window footprint offset offsets tags);
  if (defined($$phProperties{'HostIndex'}))
  {
    unshift(@aOutputHeaderFields, "hostname");
  }
  print join("|", @aOutputHeaderFields), "\n";

  ####################################################################
  #
  # Process the records. Dig records must be sorted by name and offset
  # for this technique to work.
  #
  ####################################################################

  my (@aRecordFields, $sRecord, $sRecordFieldCount);

  my $sLastHost = "";
  my $sLastName = "";

  while (my $sRecord = <$sFileHandle>)
  {
    $sRecord =~ s/[\r\n]+$//;
    @aRecordFields = split(/\|/, $sRecord, -1); # Use large chunk size to preserve trailing NULL fields.
    $sRecordFieldCount = scalar(@aRecordFields);
    if ($sRecordFieldCount != $sHeaderFieldCount)
    {
      print STDERR "$$phProperties{'Program'}: Line='$sRecord' Warning='Invalid field count ($sRecordFieldCount != $sHeaderFieldCount). Record will be ignored.'\n";
      next;
    }
    my $sHost = (defined($$phProperties{'HostIndex'})) ? $aRecordFields[$$phProperties{'HostIndex'}] : "";
    my $sName = $aRecordFields[$$phProperties{'NameIndex'}];
    my $sTag = $aRecordFields[$$phProperties{'TagIndex'}];
    my $sOffset = $aRecordFields[$$phProperties{'OffsetIndex'}];
    my $sLength = (defined($$phProperties{'StringIndex'})) ? length(EadFTimesUrlDecode($aRecordFields[$$phProperties{'StringIndex'}])) : $aRecordFields[$$phProperties{'FootprintIndex'}];
    if ($sName ne $sLastName || $sHost ne $sLastHost)
    {
      $phFileGroups = dclone(\%hMasterGroups); # Deep clone.
      $sLastHost = $sHost;
      $sLastName = $sName;
    }
    if (!defined($sTag) || length($sTag) < 1)
    {
      print STDERR "$$phProperties{'Program'}: Line='$sRecord' Warning='Tag is missing. Record will be ignored.'\n";
      next;
    }
    foreach my $sGroup (keys(%$phFileGroups))
    {
      if (exists($$phFileGroups{$sGroup}{'Offsets'}{$sTag}))
      {
        $$phFileGroups{$sGroup}{'Offsets'}{$sTag} = $sOffset;
        $$phFileGroups{$sGroup}{'Lengths'}{$sTag} = $sLength;
        my @aOffsets = sort({ $a <=> $b } values(%{$$phFileGroups{$sGroup}{'Offsets'}}));
        my $sLower = $aOffsets[0];
        my $sUpper = $aOffsets[$#aOffsets];
        my $sRange = $sUpper - $sLower;
        my $sRangeIsInfinity = ($$phFileGroups{$sGroup}{'RangeWindow'} eq "infinity") ? 1 : 0;
        if ($sLower >= 0 && $sUpper >= 0 && ($sRangeIsInfinity || $sRange <= $$phFileGroups{$sGroup}{'RangeWindow'}))
        {
          my @aTags = sort({ $$phFileGroups{$sGroup}{'Offsets'}{$a} <=> $$phFileGroups{$sGroup}{'Offsets'}{$b} } keys(%{$$phFileGroups{$sGroup}{'Offsets'}}));
          my %hInnerSlots = ();
          my %hUpperSlots = ();
          foreach my $sKey (@aTags)
          {
            my $sFirst = $$phFileGroups{$sGroup}{'Offsets'}{$sKey} - $sLower;
            my $sFinal = $$phFileGroups{$sGroup}{'Lengths'}{$sKey} + $sFirst - 1;
            if ($$phFileGroups{$sGroup}{'Offsets'}{$sKey} < $sUpper)
            {
              $sFinal = $sRange - 1 if ($sRange > 0 && $sFinal >= $sRange);
              foreach my $sSlot ($sFirst .. $sFinal)
              {
                $hInnerSlots{$sSlot}++;
              }
            }
            else
            {
              foreach my $sSlot ($sFirst .. $sFinal)
              {
                $hUpperSlots{$sSlot}++;
              }
            }
          }
          my $sOffsetList = join(",", @aOffsets);
          my $sTagList = join(",", @aTags);
          my $sGap = ($sRange - scalar(keys(%hInnerSlots))) / (scalar(@aOffsets) - 1); # This is an average.
          my $sFootprint = $sRange + scalar(keys(%hUpperSlots));
          my $sGapLimit = $$phProperties{'GapLimit'};
          my $sProximityGap = ($sGap < $sGapLimit) ? $sGap : $sGapLimit;
          my @aOutputRecordFields =
          (
            $sName,
            $sGroup,
            ($sTagList eq $hMasterGroups{$sGroup}{'TagList'}) ? "y" : "n",
            sprintf("%.2f", (($sGapLimit - $sProximityGap) / $sGapLimit)),
            sprintf("%.2f", $sGap),
            $sGapLimit,
            $sRange,
            $$phFileGroups{$sGroup}{'RangeWindow'},
            $sFootprint,
            $sLower,
            $sOffsetList,
            $sTagList
          );
          if (defined($sHost) && length($sHost))
          {
            unshift(@aOutputRecordFields, $sHost);
          }
          print join("|", @aOutputRecordFields), "\n";
        }
      }
    }
  }

  ####################################################################
  #
  # Clean up and go home.
  #
  ####################################################################

  close($sFileHandle);

  1;


######################################################################
#
# AddGroup
#
######################################################################

sub AddGroup
{
  my ($sGroupKvp, $sMasterRange, $phMasterGroups, $psError) = @_;

  ####################################################################
  #
  # Parse and add the specified group. Note that each tag in a group
  # must be initialized using a negative value. This ensures that the
  # numeric sorts done during dig record processing work as expected.
  #
  ####################################################################

  my ($sGroup, $sValue) = split(/=/, $sGroupKvp);
  my ($sTagList, $sRange) = split(/:/, $sValue, -1);
  if (!exists($$phMasterGroups{$sGroup}))
  {
    my @aTags = split(/,/, $sTagList);
    if (scalar(@aTags) < 2)
    {
      $$psError = "Group ($sGroup) must have at least two tags.";
      return undef;
    }
    foreach my $sTag (@aTags)
    {
      if (exists($$phMasterGroups{$sGroup}{'Offsets'}{$sTag}))
      {
        $$psError = "Group ($sGroup) has a duplicate tag ($sTag).";
        return undef;
      }
      $$phMasterGroups{$sGroup}{'Offsets'}{$sTag} = -1;
      $$phMasterGroups{$sGroup}{'Lengths'}{$sTag} =  0;
    }
    $$phMasterGroups{$sGroup}{'TagList'} = $sTagList;
    if (!defined($sRange))
    {
      $$phMasterGroups{$sGroup}{'RangeWindow'} = $sMasterRange;
    }
    else
    {
      if ($sRange !~ /^(?:\d+|infinity)$/)
      {
        $$psError = "Group ($sGroup) has an invalid range ($sRange). Use a decimal number or \"infinity\".";
        return undef;
      }
      $$phMasterGroups{$sGroup}{'RangeWindow'} = $sRange;
    }
  }
  else
  {
    $$psError = "Group ($sGroup) is already defined.";
    return undef;
  }
}


######################################################################
#
# Usage
#
######################################################################

sub Usage
{
  my ($sProgram) = @_;
  print STDERR "\n";
  print STDERR "Usage: $sProgram [-l limit] [-r range] {-G group=tag,tag[,tag[,...]][:range]|-g <groups-file>} -f {file|-}\n";
  print STDERR "\n";
  exit(1);
}


=pod

=head1 NAME

ftimes-proximo - Locate a group of dig hits within a specified byte range

=head1 SYNOPSIS

B<ftimes-proximo> B<[-l limit]> B<[-r range]> B<{-G group=tag,tag[,tag[,...]][:range]|-g <groups-file>}> B<-f {file|-}>

=head1 DESCRIPTION

This utility locates a group of dig hits within a specified byte
range.  To work properly, the input must be sorted by 'hostname' (when
present), 'name', and 'offset' in ascending order.  Note that this
utility does not sort the input -- that step can be done with
ftimes-sortini(1).  The input format can vary so long as it contains
at least the 'name', 'tag', 'offset', and 'string' fields.  The two
most common formats are:

    name|type|tag|offset|string

and

    hostname|name|type|tag|offset|string|joiner

The first is produced by ftimes(1) and hipdig(1), and the second is
produced by ftimes-dig2dbi(1).  Each input record must contain a
non-null tag value -- those that don't will be ignored.  Generally,
each tag should correspond to a unique dig string.  However, tag
overloading is allowed.

This utility can also take its own output as input, thus providing a
way to analyze groups of groups.  In that case, the input must contain
at least the 'name', 'group', 'footprint', and 'offset' fields.

Output is written to stdout in one of the following formats:

    name|group|ordered|proximity|gap|limit|range|window|footprint|offset|offsets|tags

or

    hostname|name|group|ordered|proximity|gap|limit|range|window|footprint|offset|offsets|tags

The breakdown of the output format is as follows:

=over 4

=item B<hostname>

Hostname of the subject system.  This value is transferred directly
from the input stream, but only if that field is present.

=item B<name>

URL-encoded filename.  This value is transferred directly from the
input stream.

=item B<group>

Name of the group (as defined on the command line or in a group config
file) that was matched.

=item B<ordered>

Boolean value (y/n) indicating whether the actual tag order matches
the order specified in the group definition.  If order is important,
be sure to specify group definitions using the desired order.

=item B<proximity>

A value from 0.00 to 1.00 indicating the relative proximity of the dig
hits for a given group.  This value is computed as follows:

    ( <limit> - <gap> ) / <limit>

where the gap is the smaller of the specified limit (B<-l> option) or
actual gap.

=item B<gap>

The average gap, in bytes, between adjacent dig hits.

=item B<limit>

The largest average gap between dig hits for them to be considered
close.  As the actual gap approaches this number, proximity goes to
zero.

=item B<range>

The number of bytes between the lowest and highest dig offsets for a
given match.

=item B<window>

The number of bytes used to determine whether a given match is in
range or not.

=item B<footprint>

The number of bytes between the beginning of the first and end of the
last dig hits (inclusive).

=item B<offset>

The offset of the group hit.  This corresponds to the lowest offset
within a group for a given match (hit).

=item B<offsets>

Comma delimited list of dig offsets in the order they were found.

=item B<tags>

Comma delimited list of dig tags in the order they were found.

=back

The trigger event for generating an output record is a group match.
Each time a member offset changes for a given group, the entire group
is evaluated to see if the resulting set of offsets fall within the
specified range.  If that condition is met, then an output record is
generated.

=head1 OPTIONS

=over 4

=item B<-f {file|-}>

Specifies the name of the input file.  A value of '-' will cause the
program to read from stdin.

=item B<-G group=tag,tag[,tag[,...]][:range]>

Specifies a group definition where

=over 4

=item B<group>

The name of the group.

=item B<tag,tag[,tag[,...]]>

A comma delimited list of two or more unique dig tags.

=item B<range>

A decimal number or the word 'infinity'.  The range is optional in a
group definition.

=back

=item B<-g groups-file>

Specifies the name of a file containing one or more group definitions.
The format is the same as that used for the B<-G> option.

=item B<-l limit>

Specifies the largest average gap between dig hits for them to be
considered close.  As the gap approaches this number, proximity goes
to zero.  The default gap is 100 bytes.

=item B<-r range>

A decimal number or the word 'infinity'.  If the latter is specified,
then the range window is all bytes in a given file.  This is useful if
you simply want to determine whether or not all tags occur in a given
file.  The default range is 100 bytes.

=back

=head1 CAVEATS

Group matching only maintains (i.e., remembers) the last offset of
each group member.  This means that there are cases where a single
group could have multiple matches in a specified range, but only one
is reported.  For example, suppose you have the following group
definition:

    g_test=a1,b2,c3,d4:100

Now suppose that you have the following dig records:

    name|type|tag|offset|string
    "file"|normal|a1|10|a1
    "file"|normal|b2|20|b2
    "file"|normal|c3|30|c3
    "file"|normal|a1|40|a1
    "file"|normal|d4|50|d4

In this case, one could say that the group matches twice within the
specified range of 100 bytes.  Once for offsets 10, 20, 30, and 50,
and once for offsets 20, 30, 40, and 50.  Since this utility only
maintains the last offset of each group member, only the second set of
offsets is considered a match.  This happens because the 'a1' offset
is reset from 10 to 40 when the fourth record (not counting the
header) is porcessed.  Effectively, this means that given two
potential matches within a specified range, the match where the
offsets are the closest always wins.

=head1 AUTHOR

Klayton Monroe

=head1 SEE ALSO

ftimes(1), ftimes-dig2ctx(1), ftimes-dig2dbi(1), ftimes-sortini(1), hipdig(1)

=head1 HISTORY

This utility was initially written to perform proximity analysis in a
case where we needed to identify last names in close proximity to
their respective Social Security Numbers (SSN).

This utility first appeared in FTimes 3.9.0.

=head1 LICENSE

All documentation and code are distributed under same terms and
conditions as FTimes.

=cut
