[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: How to display only partial message body
#!/usr/bin/perl -w
use strict;
my ($From, %headers, $body);
my $quiet = 1;
# 1. input: a mail message on STDIN
# stashes the three parts of the message into global vars: $From, %headers, $body
&get_input;
# 2. PROCESS / MUNGE
&process_body;
# 3. output: a fixed-up mail message to STDOUT
# also writes file at 'NOTEDIR/msgid' , this is used by mhonarc to produce META DESCRIPTION
print &whole_msg;
&generate_note;
#
# ---------- SUBS ----------------
#
sub process_body {
# unused at this point - obviated by the later line:
# $body =~ s/\n\s\s+(\S)/\n\n$1/mg;
# my $SO_MANY_LINES_INDICATES_NO_LINE_BREAKS = 25;
# INPUT - global variable $body
# OUTPUT - global variable $body
# We do 3 things to cleanup the body of the message:
# 1. We change patterns in the text on a line by line basis, ensuring logical
# chunks get seperated by newlines and translating the machine markup like '=20' and '='
# into newlines or spaces, as appropriate by the context.
# 2. We look at chunks (paragraphs), as seperated by blank lines.
# Each paragraph is inspected, and it is either 'flow-wrapped' or 'preserve-line-breaks'
# 3. We do final cleanup.
# 1. LINE BY LINE fixing
# some emails have =20 on its own line, this is a seperator, replace with \n\n
$body =~ s/\n=20\n/\n\n/mg;
# some emails have =20 and then a blank line, this is a sep, replace with \n\n
$body =~ s/=20\n\n/\n\n/mg;
# some emails have =20 and then a line with text.
# replace this with a space to join lines into paragr.
$body =~ s/=20\n(\w)/ $1/mg;
# some emails have =20 abutting right next to text - just delete the =20
$body =~ s/(\w)=20\n/$1\n/mg;
# some emails have '=' at the end of the line, and then a \n. join this to the next line
$body =~ s/=\n/ /mg;
# ensure lines that are ALL UPPERCASE are seperated from others
$body =~ s/(\n[[:upper:]]+\n)/\n$1/mg;
# ensure lines that are all '--------' are seperated from others
$body =~ s/(\n(-+)\n)/\n$1\n/mg;
# replace three or more blank lines with a two blank lines - we don't need more than 2.
$body =~ s/\n(\n)+/\n\n/mg;
# ensure indented paragraphs get are seperated from the paragraph above them, with a newline
$body =~ s/\n\s\s+(\S)/\n\n$1/mg;
# 2. PARAGRAPH BY PARAGRAPH FIXING
my @paras = split(/(\n\n)/, $body);
my $para;
$body = '';
my $SO_FEW_CHARACTERS_IN_LINE_INDICATES_SPECIAL = 46;
PARAGRAPH: foreach $para (@paras) {
$para =~ s/^\n//;
# $para =~ s/\n$//;
my @lines_in_para = split(/\n/, $para);
my $num_lines = 1;
if ($#lines_in_para) {
$num_lines = 1 + $#lines_in_para;
}
my $firstline = $lines_in_para[0];
$firstline = '' unless $firstline;
my $length_of_first_line = length( $firstline );
my $secondline = '';
if ($num_lines > 1) {
$secondline = $lines_in_para[1];
}
;
my $length_of_second_line = length( $secondline );
d ("---","---");
d ("num_lines", $num_lines);
d ("firstline", $firstline);
d ("length of first line", $length_of_first_line);
d ("secondline", $secondline);
d ("length of second line", $length_of_second_line);
# CASE 1: paragraph is just a blank line -> preserve break
if ($length_of_first_line < 1) {
d("CASE", "empty");
d("para", $para);
$body .= "\n";
next PARAGRAPH
}
# CASE 2: first line and second line in paragraph is quite short,
# preserve formatting by adding line break after each line
elsif (
($length_of_first_line < $SO_FEW_CHARACTERS_IN_LINE_INDICATES_SPECIAL)
and
($length_of_second_line < $SO_FEW_CHARACTERS_IN_LINE_INDICATES_SPECIAL)
) {
d("CASE ", "blockquote para");
for (@lines_in_para) {
$body .= '>' . $_ . "\n";
}
next PARAGRAPH;
}
# CASE 3: first line but not second line in paragraph is quite short,
# preserve formatting of first line, seperate the rest.
elsif (
($length_of_first_line < $SO_FEW_CHARACTERS_IN_LINE_INDICATES_SPECIAL)
and
($length_of_second_line >= $SO_FEW_CHARACTERS_IN_LINE_INDICATES_SPECIAL)
) {
d("CASE", "title line, then regular para");
my $i = 1;
$body .= $firstline . "\n\n";
my ($j) = $num_lines - 1;
for ($i .. $j) {
$body .= $lines_in_para[$_] . "\n" ;
}
;
next PARAGRAPH;
} else {
d("CASE", "exact");
$body .= $para . "\n\n";
next PARAGRAPH;
}
}
# 3. FINAL CLEANUP
$body =~ s/^From/ From/mg;
$body =~ s/\n(\n)+/\n\n/mg;
$body =~ s/\>\n/\n/mg;
}
sub generate_note {
# the only configuration you _need_ is to change $myhome
my $myhome = '/home/wfnorg';
my $sep = '/';
my $notedir = $myhome . $sep . 'notes';
-d $notedir or die "please create $notedir";
my ($overwrite); # will overwrite any existing notes if 1
my ($quiet); # will be quiet about warning if 1
$quiet = 0;
my (%note_fields);
# find the Message-ID and stash in the the %note_fields array
while (my ($key,$value) = each %headers) {
$note_fields{ message } = $value if $key =~ /^message-id/i;
$note_fields{ msg } = $value if $key =~ /^msg-id/i;
$note_fields{ content } = $value if $key =~ /^content-id/i;
}
# -------------- C) msgid cleanup
my ($msgid);
$msgid = $note_fields{message} || $note_fields{msg} || $note_fields{content};
if (defined($msgid)) {
if ($msgid =~ /<([^>]*)>/) {
$msgid = $1;
} else {
$msgid =~ s/^\s+//;
$msgid =~ s/\s+$//;
}
} else {
# create bogus ID if none exists
eval {
# create message-id using md5 digest of header;
# can potentially skip over already archived messages w/o id
require Digest::MD5;
$msgid = join("", Digest::MD5::md5_hex(join '', values %headers),
'@NO-ID-FOUND.mhonarc.org');
};
if ($@) {
# unable to require, so create arbitary message-id
$msgid = join("", $$, '.', time, '.', $_,
'@NO-ID-FOUND.mhonarc.org');
}
$headers{ 'Message-ID' } = $msgid;
}
my $note = '';
my @paras = split(/(\n\n)/, $body);
foreach my $para (@paras) {
$_ = $para;
next if ( length($note) gt 300 );
# grab following paragraph if we have one short but good paragraph.
next unless ( $note or (! /^[^ ]: / and /(\.|\?)"?\s*$/ ));
$note .= $_;
$note =~ s/---+[^-]*---+//g;
$note =~ s (\<|\>|'|") ()g;
}
# -------------- E) write $notedir/$msgid
my $notefile = $notedir . $sep . msgid_to_filename($msgid);
# sanity check
if (-e $notefile and ! $overwrite ) {
print STDERR "$notefile ... exists\n" unless $quiet;
exit;
} else {
open (NOTE, ">" . $notefile) or die "could not create $notefile";
print NOTE substr ( $note, 0,600);
close (NOTE) or die "could not create $notefile";
}
}
;
sub get_input {
my ($inHeader, $cur_header, $cur_header_value);
$inHeader = 1;
while (<>) {
if ($inHeader) {
chomp;
# Case 1 - the initial line, like
# From wfn-editors-bounces@xxxxxxx Wed Mar 1 12:40:06 PST 2006
if (/^From\s/) {
$From = $_;
}
# Case 2 - a key-value pair of the header, like
# From: <NEWS@xxxxxxxx>
elsif (/^\S+:/) {
&imprint_header($cur_header, $cur_header_value);;
($cur_header, $cur_header_value) = split (/: /, $_, 2);
}
# Case 3 - a 'flow-over' line in the header, like a long Subject: line.
elsif (/\s+\S/) {
chomp;
s/^\s+/ /g;
$cur_header_value .= $_;
}
# Case 4 - blank line - the end of the header
if (/^$/) {
&imprint_header($cur_header, $cur_header_value);;
$inHeader = 0;
}
} # inHeader
else {
$body .= $_;
}
;
}
}
# ----------------------------------------------------------------
# UTILITY SUBS
#
# combines the three parts of the message ($From, %headers, $body) into a string.
sub whole_msg {
my ($whole_msg);
$whole_msg = "$From\n";
while (my ($key,$value) = each %headers) {
$whole_msg .= "$key: $value\n";
}
$whole_msg .= "\n" . $body;
return $whole_msg;
}
sub imprint_header ($$) {
my ($key, $value) = @_;
if ($key) {
$headers{$key} .= $value;
}
}
sub msgid_to_filename {
my $msgid = shift;
$msgid =~ s/([^\w.\-\@])/sprintf("=%02X",unpack("C",$1))/geo;
$msgid;
}
sub d{
my ($label, $value) = @_;
return if $quiet;
print STDERR "$label: $value\n";
}
__END__
[Index of Archives]
[Bugtraq]
[Yosemite News]
[Mhonarc Home]