# Copyright 2009 Alexandra Klepatsch
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# facileOCR version: 0.5

package facileOCR;

use strict;
use Mail::SpamAssassin;
use Mail::SpamAssassin::Util;
use Mail::SpamAssassin::Plugin;
use File::Temp qw/ tempfile tempdir /;

our @ISA = qw (Mail::SpamAssassin::Plugin);
my $maxsize = 50;

# constructor: register the eval rule
sub new {
   my ( $class, $mailsa ) = @_;
   $class = ref($class) || $class;
   my $self = $class->SUPER::new($mailsa);
   $self->get_config($mailsa->{conf});
   bless( $self, $class );
   $self->register_eval_rule("check_ocr");
   return $self;
}

sub get_config {
# whatever was set in config, default values otherwise
my ($self, $conf, $pms ) = @_;
my @opts = ();
  push(@opts, {
                setting => 'spamwords',
                is_admin => 1,
                default => "viagra,cialis",
                type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
              });
  push(@opts, {
                setting => 'debuglog',
                is_admin => 1,
                default => "off",
                type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
                
              });
  push(@opts, {
                setting => 'ocr_profile',
                is_admin => 1,
                default =>  1,
                type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC

              });
  push(@opts, {
                setting => 'max_img_size',
                is_admin => 1,
                code  => sub {
                my ($self, $key, $value, $line) = @_;
                # avoid unrealistic or wrong values
                if ($value ne "")
                  { 
                  $value =~ s/.*[^\d].*/-1/g;
                  if ($value > 100 or $value < 10)
                  {
                  if ($value > 100) {$value = 100;}
                  elsif ($value < 10) {$value = 10;}
                  else {$value = 50;}
                  }
                  else {$maxsize = $value;}
                  }
                }
              });
$conf->{parser}->register_commands(\@opts);
}


sub check_ocr 
# call the other subs, give the score
{
   my ( $self, $pms ) = @_;
   my $dbg = $pms->{main}->{conf}->{debuglog};

   debuglog($dbg, "start with tests");

   my ($imgTextOcr, $numImages) = imageExtractionFromMSG($dbg, $pms->{msg}, $pms->{main}->{conf});
   my $cnt = 0;
   my ($configwords, @words);
   if ($numImages > 0 && $imgTextOcr ne "")
    {
         debuglog($dbg, "$numImages img found, imgTextOcr contains:\n $imgTextOcr\n");
         ($configwords = $pms->{main}->{conf}->{spamwords}) =~ s/[&*@\\\/`!{}\[\]:\^\$"='<>()%;~|.+?]//gi;
         @words = split(/,/, $configwords);
         my $w;

         foreach $w (@words) {
         while ($imgTextOcr =~ m/$w/gi) {
            $cnt++;
            }
         }
         debuglog($dbg, "number of spamwords found: $cnt");
    }
   else
    {
         debuglog($dbg, "no result");
    }
    
    if($cnt) {
        # finally give the score
        my $score = sprintf("%0.2f", $cnt * 0.8);
        $pms->got_hit("FOCR", "facileOCR: $cnt ", score => $score );
        for my $set (0..3) {
            $pms->{conf}->{scoreset}->[$set]->{"FOCR"} = $score;
        }

    }

return 0;
}

sub imageExtractionFromMSG
# ($imgTextOcr, $numImages) = imageExtractionFromMSG($msg)
# Return all text and the number of attached images
# called by check_ocr
{
    my $dbg = $_[0];
    my $msg = $_[1];
    my $conf = $_[2];
    my @mimeStr = ("image/*", "img/*");
    my $num=0;
    my $imgTextOcr = "";
    my $maxbytes = ($maxsize * 1024) + 10;
    foreach (@mimeStr)
    {
        # Search all attachments with current MIME
        my @img_parts =  $msg->find_parts($_);
        for (my $i=0; $i <= $#img_parts; $i++)
        {
            my $imagestream = $img_parts[$i]->decode($maxbytes);
             $imgTextOcr = join $imgTextOcr,  textExtractionByOCR($dbg, $imagestream, $conf);
             $num++;
        }
    }

    return ($imgTextOcr, $num);
}

sub textExtractionByOCR
# $textOut = textExtractionByOCR( $imagestream )
# tests, convert, text extraction
# called by imageExtractionFromMSG
{
    my $dbg = $_[0];
    my $imagestream = $_[1];
    my $conf = $_[2];
    my $imagelen = sprintf("%0.2f", length($imagestream) / 1024);
    my ($fh_a, $tmpImg) = tempfile('/tmp/sa_tmpImg.XXXXXXXX');    

    # never bigger than config max size or smaller than 6kB
    if ($imagelen < 6) 
    {
        debuglog($dbg, "skip, image size = $imagelen kB");
        unlink($fh_a, $tmpImg);
        return "";
    }
    if ($imagelen > $maxsize)
    {
        debuglog($dbg, "skip, image size > $maxsize kB");
        unlink($fh_a, $tmpImg);
        return "";
    }   

    open (FILE, ">$tmpImg") or return "";
    print FILE "$imagestream\n";
    close FILE;

    my $imageIdentifyTxt = "";
    
        # check WxH, type, layers
        open  EXEFH, "identify -quiet -format '%wx%h %m %n' $tmpImg 2>/dev/null |";
        $imageIdentifyTxt = join "", <EXEFH>;
        close EXEFH;
        
        if( $imageIdentifyTxt =~ /^(\d+)x(\d+) (GIF|PNG|JPEG).* ([1-9])$/ )
        {
            my $width = $1;
            my $height = $2;
            my $magic = $3;
            my $layercnt = $4;
            
            debuglog($dbg, "img info: $width x $height $magic, $layercnt layer(s), $imagelen kB");
            if($width * $height  > 1024*1024 && ($width > 1024 or $height > 1024) )
            {
                debuglog($dbg, "skip, image dimension = $width x $height");
                unlink($fh_a, $tmpImg);
                return "";
            }
        }
        else
        {
            debuglog($dbg, "identify: skip, wrong type, image broken or too many layers");
            unlink($fh_a, $tmpImg);
            return "";
        }
    # -append  :: concatenate image
    # -flatten :: fuse layers
    # -depth :: 8-bit img
    # -background white :: convert tranparency
    my $convertOpts = "-append -flatten -depth 8 -background white";
    my $textOut = "";
    my $ocrProfile = "";
    my $ocradOpts = "";
    my ($fh_b, $tmpTxt) = tempfile('/tmp/sa_tmpTxt.XXXXXXXX');
    
        if ($conf->{ocr_profile} =~ /^[1-3]$/) 
        {
         $ocrProfile = $conf->{ocr_profile};
        }
        else 
        { 
         $ocrProfile = 1;
        }
        SWITCH:
        {
         if ($ocrProfile == 1) {$ocradOpts = "-s 2 -F utf8 -e letters_only";}
         if ($ocrProfile == 2) {$ocradOpts = "-s 3 -i -F utf8 -e letters_only";}
         if ($ocrProfile == 3) {$ocradOpts = "-s 3 -T 38% -F utf8 -e letters_only";}
        }
  # OCR call with timeout
  my $child;
  my $error = undef;
  my ($fh_c, $sh_exit) = tempfile('/tmp/sa_shExit.XXXXXXXX');

  eval {
      local $SIG{ALRM} = sub { die 'alarm' };
      $child = fork;
      die 'fork' unless(defined $child);
      if($child){
        alarm 10;
        wait;
        alarm 0;
      }
      else
      {
        exec("convert $convertOpts $tmpImg pnm:- 2>> $sh_exit|ocrad $ocradOpts >$tmpTxt 2>> $sh_exit");
        exit;
      }
  };
  # get shell errors (exec), if any
  if ( -s $sh_exit )
  {
  open SHXT, "< $sh_exit";
  my $shexit = join "",<SHXT>;
  close SHXT;
  debuglog($dbg, "shell errors: \n$shexit");
  }
  
  # get timeout or fork errors, kill if needed
  if($error = $@)
  {
      if( $error =~ /alarm/)
      {
      alarm(0);
      kill 9, $child;
      debuglog($dbg, "convert/ocrad timeout: kill pid $child");
      }
      elsif($error =~ /fork/)
      {
      debuglog($dbg, "Unable to fork: $error");
      }
      else
      {
      die "Error: $error";
      }
  }
  else
  {
      open OCRTXT, "< $tmpTxt";
      $textOut = join "", <OCRTXT>;
      $textOut =~ s/\s+$//mg;
      close OCRTXT;
  }        
    unlink($fh_a, $tmpImg);
    unlink($fh_b, $tmpTxt);
    unlink($fh_c, $sh_exit);

    return $textOut;
 
}

sub debuglog()
# write to a logfile
{
  my ($dbg, $msg) = @_;   

  if ($dbg eq "on")
    {
    my $timenow = localtime time;
    open (FILE, ">>/tmp/sa_OCR.log");
    print FILE "$timenow -- $msg\n";
    close FILE;
    }
}
