Re: How to display only partial message body

Michael de Beer <michael@xxxxxxxxxx> · Wed, 8 Mar 2006 12:32:27 -0500

#!/usr/bin/perl
# this is last30-xml.pl
#------------------------------------

BEGIN {push @INC, '/home/wfnorg/lib/perl5/i386-linux', '/home/wfnorg/bin';}
my $www = '/home/wfnorg/www';

# This script produces 8 seperate RSS feeds.
# It does an mhonarc -scan , and then uses some if-then logic to build lists of articles
# that should go into each feed.

$page_pcusa = "$www/rss/pcusa-rss.xml";
$page_wfn = "$www/rss/wfn-rss.xml";
$page_ens = "$www/rss/ens-rss.xml";
$page_elca = "$www/rss/elca-rss.xml";
$page_nccc = "$www/rss/nccc-rss.xml";
$page_lwf = "$www/rss/lwf-rss.xml";
$page_ucc = "$www/rss/ucc-rss.xml";
$page_wcc = "$www/rss/wcc-rss.xml";
$page_lcms = "$www/rss/lcms-rss.xml";

# ----------- ENVIRONMENT -------------
use lib '/home/wfnorg/lib/perl5/site_perl/5.8.4';
#        '/home/wfnorg/lib/perl5/lib/site_perl/5.005';
use Unicode::String;
require HTML::HeadParser;
use Date::Manip;
use HTTP::Headers;

# I don't think i use MongerFile anymore, but if it does not work, uncomment this line.
#use MongerFile;

my $www = '/home/wfnorg/www';
my $mh = '/home/wfnorg/mh/bin/mhonarc';
$base_url = "http://www.wfn.org";;
$i=20;

$wfn_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">

   <channel>

     <title>WorldFaith News - www.wfn.org</title>
     <link>http://www.wfn.org/</link>
     <description>Official news releases of national and world faith groups</description>
     <language>en-us</language>
		 ];

my $pcusa_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">

   <channel>

     <title>PCUSA News @ www.wfn.org</title>
     <link>http://www.wfn.org/</link>
     <description>Official news releases of PCUSA</description>
     <language>en-us</language>
		 ];

$rss_bottom = qq[

</channel>
</rss>

	  ];

my $ens_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">

   <channel>

     <title>ENS News @ www.wfn.org</title>
     <link>http://www.wfn.org/</link>
     <description>Episcopal Church in the USA - News Service</description>
     <language>en-us</language>
		 ];

my $elca_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">

   <channel>

     <title>ELCA News @ www.wfn.org</title>
     <link>http://www.wfn.org/</link>
     <description>Evangelical Lutheran Church in America News Releases</description>
     <language>en-us</language>
		 ];

my $nccc_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">

   <channel>

     <title>NCC USA News @ wfn.org</title>
     <link>http://www.wfn.org/</link>
     <description>National Council of the Churches USA</description>
     <language>en-us</language>
		 ];

my $lwf_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">

   <channel>

     <title>LWF News @ www.wfn.org</title>
     <link>http://www.wfn.org/</link>
     <description>Lutheran World Federation</description>
     <language>en-us</language>
		 ];

my $ucc_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">

   <channel>

     <title>UCC News @ www.wfn.org</title>
     <link>http://www.wfn.org/</link>
     <description>United Church of Christ - News Releases and Special Reports</description>
     <language>en-us</language>
		 ];

my $wcc_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">

   <channel>

     <title>World Council of Churches @ www.wfn.org</title>
     <link>http://www.wfn.org/</link>
     <description>World Council of Churches official news releases</description>
     <language>en-us</language>
		 ];

my $lcms_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">

   <channel>

     <title>LCMS News @ www.wfn.org</title>
     <link>http://www.wfn.org/</link>
     <description>LCMS news releases</description>
     <language>en-us</language>
		 ];

# ----------- DATE VARIABLES ----------
sub cleanup
{    my ($in) = shift;
    $in =~ s/&/+/g;
  $in =~ s/"//g;
#  $in =~ s///g;
  $in =~ s/</[/g;
  $in =~ s/>/]/g;
#  return $in;
  my $u = Unicode::String->new($in);
  return $u->utf8;
}

sub strip_whitespace {
    my (@refs) = @_;
    foreach $ref (@refs) {
	$$ref =~ s/^\s+//g;
	$$ref =~ s/\s+$//g;
    }
} 

sub doit {
my $h = HTTP::Headers->new;
my ($from, $file, $date, $url) = @_;
#-----------------------------------
#--- parse html file, and extract TITLE, DESCRIPTION
my $p = HTML::HeadParser->new($h);
open (FILE, $file) or die "could not open $filename";
$head = '';
while (<FILE>) { 
    $head .= $_;
    last if /^<\/HEAD>/;
}
close (FILE) or die "could not close $file";
$p->parse($head) and  print "not finished";

#my $FF = new MongerFile($file);
#$date =	$FF->created_gmt;
my $created = $^T - int((-M "$file") * 86400);
$created = $created - (7 * 60 * 60);
my $date = gmtime($created);
my $Title = $p->header('Title');
#print "title: $Title\n";
$Title = substr($Title, 10); 
#print "subst: $Title\n";
#print $p->header('X-meta-description');
$description = $p->header('X-meta-description');

#-----------------------------------
# prepare for printing
strip_whitespace (\$from);
#print "A $Title \n";    
$Title = &cleanup($Title);
#print "t $Title \n";    
$f = &cleanup($from);
$d = &cleanup($description);
$date=&UnixDate($date,'%g');
#print "DATE: ", $date, "\n";
undef $h;
undef $FF;
return qq[

		    <item>
		    <link>$url</link>
		    <title>$Title ($f)</title>
                    <pubDate>$date</pubDate>
		    <description>$d</description>
      		    </item>

		    ];
}

my $err;
my $todayM = &ParseDate("today"); # date format is YYYYMMDDHH:MM:SS.
my ($today_year, $today_month) = unpack ("A4 A2", $todayM);
#print "YYYYMMDDHH:MM:SS\n";  #print "$todayM\n"; #print "$today_year : $today_month\n";
my $start = &DateCalc("today","- 28days",\$err);
my ($start_year, $start_month) = unpack ("A4 A2", $start);
#print "$start\n"; #print "$start_year : $start_month\n";

# ----------- DO THIS -----------------

open W, ">$page_wfn" or die;
print W $wfn_top;

open P, ">$page_pcusa" or die;
print P $pcusa_top;

open ens, ">$page_ens" or die;
print ens $ens_top;
open elca, ">$page_elca" or die;
print elca $elca_top;
open nccc, ">$page_nccc" or die;
print nccc $nccc_top;
open lwf, ">$page_lwf" or die;
print lwf $lwf_top;
open ucc, ">$page_ucc" or die;
print ucc $ucc_top;
open wcc, ">$page_wcc" or die;
print wcc $wcc_top;
open lcms, ">$page_lcms" or die;
print lcms $lcms_top;

&grab_scan($today_year, $today_month);
&grab_scan($start_year, $start_month, $start);

print ens $rss_bottom; close ens;
print elca $rss_bottom; close elca;
print nccc $rss_bottom; close nccc;
print lwf $rss_bottom; close lwf;
print ucc $rss_bottom; close ucc;
print wcc $rss_bottom; close wcc;
print lcms $rss_bottom; close lcms;

print W $rss_bottom;
close W;

print P $rss_bottom;
close P;

sub grab_scan($$;$){
  local ($YYYY, $MM, $filter_startdate) = @_;
  $outdir = "$www/$YYYY/$MM";
  local ($start_filtering) = '';
  $myCommand = "$mh -quiet -nolock -reverse -scan -outdir $outdir |";
#  print $myCommand;
  open (SCAN, $myCommand) or 
      die "could not scan $outdir $!";
  while (<SCAN>){
#     print;
      next unless /^\s*\d+\s+\d+/;
      next if $start_filtering; # save computer a bit of time
      $scan_template = 'A5 A12 A32 A92';
      my ($msgid, $date, $from, $subject) = unpack($scan_template, $_);

      # skip if filter_startdate is earlier than $item_date
      if ($filter_startdate){
         $item_date = &ParseDate($date);
         #print "$filter_startdate\n$item_date\n";
         $start_filtering++ if &Date_Cmp($filter_startdate, $item_date) eq 1;
         next if $start_filtering;
      }
      # mhonarc -scan has buggy titles for diacriticals, so here is one

      $lmsgid = sprintf( "%05d", $msgid ) ;
      my $file="$www/$YYYY/$MM/msg$lmsgid.html";
      my $url = "$base_url/$YYYY/$MM/msg$lmsgid.html";

#      print "a$subject\n";
      my $test = $subject;
#      if (($i > 1) or ($test =~ m/PCUSANEWS|ENS|ELCA|NCC|LWF|UCC|WCC|NCCC/)){
#          print "b$subject\n";
	  $text = &doit ($from, $file, $date, $url);
#          print "c$subject\n";

      if ($i-- > 1) {
          print W $text;
      };

      if ($subject =~ /PCUSANEWS/) {
	  print P $text;
      };

      if ($test =~ /ENS/){
          print ens $text;
      };

      if ($from =~ /ELCA\.ORG/){
          print elca $text;
      };

      if ( ($from =~ /ncc/) || ($subject =~ /NCC/)) {
          print nccc $text;
      };

      if ($subject =~ /LWF/){
          print lwf $text;
      };

      if ($from =~ /ucc\.org/){
          print ucc $text;
      };

      if ($from =~ /wcc-coe\.org/){
          print wcc $text;
      };

      if ($subject =~ /LCMSNews/){
          print lcms $text;
      };
      print "WORKING $msgid\n";
#  } else { 
# print "$msgid\n"; 
#}
  }
  close (SCAN) or die "could not scan $outdir - error closing handle";
}

__END__