# $Id: PunctuationFrequency.pm 678 2007-08-20 07:59:24Z aqua $

package Mail::SpamAssassin::Plugin::PunctuationFrequency;

=head1 NAME

Mail::SpamAssassin::Plugin::PunctuationFrequency - Test mail for abnormal punctuation

=head1 SYNOPSIS

To enable in configuration:

 loadplugin Mail::SpamAssassin::Plugin::PunctuationFrequency

To perform punctuation tests:

 body     PUNCTUATION_BODY_50 eval:check_punctuated_word_frequency_body(50)
 describe PUNCTUATION_BODY_50 At least 50% of words in body contain punctuation

 body     PUNCTUATION2_BODY_10 eval:check_punctuated_word_frequency_body(10,2,1)
 describe PUNCTUATION2_BODY_10 At least 10% of words in body contain multiple interior punctuation characters

header   PUNCTUATION2_SUBJECT_10 eval:check_punctuated_word_frequency_subject(10,2)
describe PUNCTUATION2_SUBJECT_10 At least 10% of words in subject contain multiple punctuation characters

=head1 DESCRIPTION

This plugin performs tests on punctuation found in words found within email,
for use in creating scoring rules based on abnormalities in distribution of
punctuation.  The main intention is to catch punctuation-obfuscated text
as used by spammers in an attempt to reduce Bayesian test effectiveness and to
avoid simplified word-based pattern matching.

This plugin differs from the two conventional techniques used to detect this
sort of obfuscation: negative-lookahead assertion patterns based on a known
vocabulary, and vocabulary-less patterns based on the presence of tokens with
excessive punctuation symbols found therein.  Both of these approaches are
worthwhile, but each have a singificant limit.  Tests for mangled forms of
known words, while they yield high-confidence matches, require knowing the
spammer's vocabulary in advance, and become gradually obsolete as the products
hawked by spam evolve.  Pattern-based matching of words with intermixed
punctuation can test at most for the presense of one or a few excessively
punctuated words, and cannot describe the entire content of the message;
consequently, they cannot be used to reflect the properties of the mail as a
while.  Simple assertive pattern-based obfuscation detection is also highly
suceptible to false positives, and hence can be given only modest weighting.

This plugin complements those two approaches by enabling tests based on the
frequency of words containing chosen quantities of punctuation, along with
common-sense exclusions not readily feasible in simple pattern matches.

The frequency count is done on the "rendered text" form of the mail, and hence
does not examine HTML structure or non-text MIME sections.  URIs identifiable
as such by SpamAssassin are not counted.

=head1 EVAL TESTS

Two 'eval' tests are available for use in rules, each following the
same basic pattern.  Three arguments common to each are as follows:

=over 4

=item I<percentage>

The percentage of tokens in the mail that must contain punctuation for the
rule to match.  Given as an integer, e.g. "10" signifying 10%.

=item I<minimum_chars>

The minimum number of punctuation characters that must be found in a token
for it to count towards I<percentage>.  The default value is B<1>.

=item I<interior_only>

If a true value is given, only punctuation characters found on the interior of
a word, such as "CI*AL!S" will be considered -- that is, those found on the
beginning or end, such as "$foo," will be ignored (without affecting whether
the word will still count towards I<percentage> if it also contains enough
punctuation on the interior.)

Regardless of this setting, the English posessive forms "'s" and "s'" will be
ignored if found at the end of a word.

=back

=over 4

=item check_punctuated_word_frequency_body(percentage, [minimum_chars, [interior_only]])

Returns true if I<percentage> of the non-URI tokens of the mail contain
at least I<minimum_chars> punctuation characters.  If <interior_only> is
non-zero, punctuation characters are only counted if they appeared on the
interior of the word, as opposed to either end.

=item check_punctuated_word_frequency_subject(percentage, [minimum_chars, [interior_only]])

Checks for punctuation within the message's Subject header, if any.

=back

=head1 AUTHOR, COPYRIGHT

Mail::SpamAssassin::Plugin::PunctuationFrequency is copyright (c) 2007 by Devin
Carraway E<lt>spamassassin&devin.comE<gt>.  It may be used and redistributed
under the same terms as SpamAssassin itself, to wit:

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at L<http://www.apache.org/licenses/LICENSE-2.0>

=cut

use strict;
use warnings;
use bytes;

use Mail::SpamAssassin::Plugin;
use Mail::SpamAssassin::Logger qw(log_message dbg info);

use base qw(Mail::SpamAssassin::Plugin);

sub new {
    my $class = shift;
    my $sa = shift;

    my $self = $class->SUPER::new($sa) || return undef;
    bless $self, $class;

    $self->register_eval_rule('check_punctuated_word_frequency_body');
    $self->register_eval_rule('check_punctuated_word_frequency_subject');
}

sub check_punctuated_word_frequency_body {
    my $self = shift;
    my $pms = shift;
    my $bodyarr = shift;
    my $percentage = shift;
    my $punct_min = shift || 1;
    my $interior_only = shift || 0;

    my $cn = "body $punct_min $interior_only";
    if (exists $pms->{wordmangling_punct}->{$cn}) {
        return $pms->{wordmangling_punct}->{$cn} >= ($percentage * 0.01);
    }

    my $body = $pms->get_decoded_stripped_body_text_array();
    my $pct = $self->punct_check_($pms, $body, $punct_min, $interior_only);
    $pms->{wordmangling_punct}->{$cn} = $pct;
    return $pct >= ($percentage * 0.01);
}

sub check_punctuated_word_frequency_subject {
    my $self = shift;
    $self->check_punctuated_word_frequency_header(@_, 'Subject')
}

sub check_punctuated_word_frequency_header {
    my $self = shift;
    my $pms = shift;
    my $percentage = shift;
    my $punct_min = shift;
    my $interior_only = shift;
    my $header = shift;

    unless ($header) {
        log_message('error', "No header selected, doing nothing");
        return undef;
    }
    unless ($punct_min) {
        log_message('error', "Must check for at least one punctuation character");
        return undef;
    }

    my $cn = "$header $punct_min $interior_only";
    if (exists $pms->{wordmangling_punct}->{$cn}) {
        return $pms->{wordmangling_punct}->{$cn} >= ($percentage * 0.01);
    }

    my $head = $pms->get($header) || return undef;
    chomp $head;
    my $pct = $self->punct_check_($pms, [$head], $punct_min, $interior_only);

    $pms->{wordmangling_punct}->{$cn} = $pct;
    return $pct >= ($percentage * 0.01);
}

sub punct_check_ {
    my $self = shift;
    my $pms = shift;
    my $content = shift || return undef;
    my $punct_min = shift || 1;
    my $interior_only = shift;
    
    $pms->{wordmangling_uri_lookup} ||= {
        map { ($_ => 1) } $pms->get_uri_list()
    };

    my ($total, $mangled) = (0, 0);
    for my $line (@{$content}) {
        my ($lt, $lm) = $self->punct_mangle_count_($pms,
                                                   $line,
                                                   $punct_min,
                                                   $interior_only);
        $total += $lt;
        $mangled += $lm;
    }

    unless ($total) {
      return 0;
    }

    dbg("punctuation: tokens=$total, $mangled with >= $punct_min punctuation chars, pct=".($mangled/$total));
    return ($mangled / $total);
}


sub punct_mangle_count_ {
    my $self = shift;
    my $pms = shift;
    my $line = shift || '';
    my $punct_min = shift || 1;
    my $interior_only = shift;

    my ($nonempty, $withpunct) = (0, 0);
    for my $token (split /\s+/, $line) {

        # ignore entirely nonalphabetic tokens
        next unless $token =~ tr/a-zA-Z//;

        next if exists $pms->{wordmangling_uri_lookup}->{$token};

        $nonempty++;

        # if we're only looking for punctiation inside tokens,
        # then strip it off before counting
        $token =~ s/^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$//g if $interior_only;

        # posessives don't count
        $token =~ s/^\w+(?:'[sS]|[sS]')$//;

        if ($token =~ tr/a-zA-Z0-9//c >= $punct_min) {
            # dbg("mangled token (>= $punct_min, $interior_only): $token");
            $withpunct++;
        }

    }
    return ($nonempty, $withpunct);
}


1;

# vi: set ts=4 sw=4 tw=78 et:
