[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: How to display only partial message body
#!/usr/bin/perl
# this is last30-xml.pl
#------------------------------------
BEGIN {push @INC, '/home/wfnorg/lib/perl5/i386-linux', '/home/wfnorg/bin';}
my $www = '/home/wfnorg/www';
# This script produces 8 seperate RSS feeds.
# It does an mhonarc -scan , and then uses some if-then logic to build lists of articles
# that should go into each feed.
$page_pcusa = "$www/rss/pcusa-rss.xml";
$page_wfn = "$www/rss/wfn-rss.xml";
$page_ens = "$www/rss/ens-rss.xml";
$page_elca = "$www/rss/elca-rss.xml";
$page_nccc = "$www/rss/nccc-rss.xml";
$page_lwf = "$www/rss/lwf-rss.xml";
$page_ucc = "$www/rss/ucc-rss.xml";
$page_wcc = "$www/rss/wcc-rss.xml";
$page_lcms = "$www/rss/lcms-rss.xml";
# ----------- ENVIRONMENT -------------
use lib '/home/wfnorg/lib/perl5/site_perl/5.8.4';
# '/home/wfnorg/lib/perl5/lib/site_perl/5.005';
use Unicode::String;
require HTML::HeadParser;
use Date::Manip;
use HTTP::Headers;
# I don't think i use MongerFile anymore, but if it does not work, uncomment this line.
#use MongerFile;
my $www = '/home/wfnorg/www';
my $mh = '/home/wfnorg/mh/bin/mhonarc';
$base_url = "http://www.wfn.org";
$i=20;
$wfn_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">
<channel>
<title>WorldFaith News - www.wfn.org</title>
<link>http://www.wfn.org/</link>
<description>Official news releases of national and world faith groups</description>
<language>en-us</language>
];
my $pcusa_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">
<channel>
<title>PCUSA News @ www.wfn.org</title>
<link>http://www.wfn.org/</link>
<description>Official news releases of PCUSA</description>
<language>en-us</language>
];
$rss_bottom = qq[
</channel>
</rss>
];
my $ens_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">
<channel>
<title>ENS News @ www.wfn.org</title>
<link>http://www.wfn.org/</link>
<description>Episcopal Church in the USA - News Service</description>
<language>en-us</language>
];
my $elca_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">
<channel>
<title>ELCA News @ www.wfn.org</title>
<link>http://www.wfn.org/</link>
<description>Evangelical Lutheran Church in America News Releases</description>
<language>en-us</language>
];
my $nccc_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">
<channel>
<title>NCC USA News @ wfn.org</title>
<link>http://www.wfn.org/</link>
<description>National Council of the Churches USA</description>
<language>en-us</language>
];
my $lwf_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">
<channel>
<title>LWF News @ www.wfn.org</title>
<link>http://www.wfn.org/</link>
<description>Lutheran World Federation</description>
<language>en-us</language>
];
my $ucc_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">
<channel>
<title>UCC News @ www.wfn.org</title>
<link>http://www.wfn.org/</link>
<description>United Church of Christ - News Releases and Special Reports</description>
<language>en-us</language>
];
my $wcc_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">
<channel>
<title>World Council of Churches @ www.wfn.org</title>
<link>http://www.wfn.org/</link>
<description>World Council of Churches official news releases</description>
<language>en-us</language>
];
my $lcms_top = qq[<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">
<channel>
<title>LCMS News @ www.wfn.org</title>
<link>http://www.wfn.org/</link>
<description>LCMS news releases</description>
<language>en-us</language>
];
# ----------- DATE VARIABLES ----------
sub cleanup
{ my ($in) = shift;
$in =~ s/&/+/g;
$in =~ s/"//g;
# $in =~ s///g;
$in =~ s/</[/g;
$in =~ s/>/]/g;
# return $in;
my $u = Unicode::String->new($in);
return $u->utf8;
}
sub strip_whitespace {
my (@refs) = @_;
foreach $ref (@refs) {
$$ref =~ s/^\s+//g;
$$ref =~ s/\s+$//g;
}
}
sub doit {
my $h = HTTP::Headers->new;
my ($from, $file, $date, $url) = @_;
#-----------------------------------
#--- parse html file, and extract TITLE, DESCRIPTION
my $p = HTML::HeadParser->new($h);
open (FILE, $file) or die "could not open $filename";
$head = '';
while (<FILE>) {
$head .= $_;
last if /^<\/HEAD>/;
}
close (FILE) or die "could not close $file";
$p->parse($head) and print "not finished";
#my $FF = new MongerFile($file);
#$date = $FF->created_gmt;
my $created = $^T - int((-M "$file") * 86400);
$created = $created - (7 * 60 * 60);
my $date = gmtime($created);
my $Title = $p->header('Title');
#print "title: $Title\n";
$Title = substr($Title, 10);
#print "subst: $Title\n";
#print $p->header('X-meta-description');
$description = $p->header('X-meta-description');
#-----------------------------------
# prepare for printing
strip_whitespace (\$from);
#print "A $Title \n";
$Title = &cleanup($Title);
#print "t $Title \n";
$f = &cleanup($from);
$d = &cleanup($description);
$date=&UnixDate($date,'%g');
#print "DATE: ", $date, "\n";
undef $h;
undef $FF;
return qq[
<item>
<link>$url</link>
<title>$Title ($f)</title>
<pubDate>$date</pubDate>
<description>$d</description>
</item>
];
}
my $err;
my $todayM = &ParseDate("today"); # date format is YYYYMMDDHH:MM:SS.
my ($today_year, $today_month) = unpack ("A4 A2", $todayM);
#print "YYYYMMDDHH:MM:SS\n"; #print "$todayM\n"; #print "$today_year : $today_month\n";
my $start = &DateCalc("today","- 28days",\$err);
my ($start_year, $start_month) = unpack ("A4 A2", $start);
#print "$start\n"; #print "$start_year : $start_month\n";
# ----------- DO THIS -----------------
open W, ">$page_wfn" or die;
print W $wfn_top;
open P, ">$page_pcusa" or die;
print P $pcusa_top;
open ens, ">$page_ens" or die;
print ens $ens_top;
open elca, ">$page_elca" or die;
print elca $elca_top;
open nccc, ">$page_nccc" or die;
print nccc $nccc_top;
open lwf, ">$page_lwf" or die;
print lwf $lwf_top;
open ucc, ">$page_ucc" or die;
print ucc $ucc_top;
open wcc, ">$page_wcc" or die;
print wcc $wcc_top;
open lcms, ">$page_lcms" or die;
print lcms $lcms_top;
&grab_scan($today_year, $today_month);
&grab_scan($start_year, $start_month, $start);
print ens $rss_bottom; close ens;
print elca $rss_bottom; close elca;
print nccc $rss_bottom; close nccc;
print lwf $rss_bottom; close lwf;
print ucc $rss_bottom; close ucc;
print wcc $rss_bottom; close wcc;
print lcms $rss_bottom; close lcms;
print W $rss_bottom;
close W;
print P $rss_bottom;
close P;
sub grab_scan($$;$){
local ($YYYY, $MM, $filter_startdate) = @_;
$outdir = "$www/$YYYY/$MM";
local ($start_filtering) = '';
$myCommand = "$mh -quiet -nolock -reverse -scan -outdir $outdir |";
# print $myCommand;
open (SCAN, $myCommand) or
die "could not scan $outdir $!";
while (<SCAN>){
# print;
next unless /^\s*\d+\s+\d+/;
next if $start_filtering; # save computer a bit of time
$scan_template = 'A5 A12 A32 A92';
my ($msgid, $date, $from, $subject) = unpack($scan_template, $_);
# skip if filter_startdate is earlier than $item_date
if ($filter_startdate){
$item_date = &ParseDate($date);
#print "$filter_startdate\n$item_date\n";
$start_filtering++ if &Date_Cmp($filter_startdate, $item_date) eq 1;
next if $start_filtering;
}
# mhonarc -scan has buggy titles for diacriticals, so here is one
$lmsgid = sprintf( "%05d", $msgid ) ;
my $file="$www/$YYYY/$MM/msg$lmsgid.html";
my $url = "$base_url/$YYYY/$MM/msg$lmsgid.html";
# print "a$subject\n";
my $test = $subject;
# if (($i > 1) or ($test =~ m/PCUSANEWS|ENS|ELCA|NCC|LWF|UCC|WCC|NCCC/)){
# print "b$subject\n";
$text = &doit ($from, $file, $date, $url);
# print "c$subject\n";
if ($i-- > 1) {
print W $text;
};
if ($subject =~ /PCUSANEWS/) {
print P $text;
};
if ($test =~ /ENS/){
print ens $text;
};
if ($from =~ /ELCA\.ORG/){
print elca $text;
};
if ( ($from =~ /ncc/) || ($subject =~ /NCC/)) {
print nccc $text;
};
if ($subject =~ /LWF/){
print lwf $text;
};
if ($from =~ /ucc\.org/){
print ucc $text;
};
if ($from =~ /wcc-coe\.org/){
print wcc $text;
};
if ($subject =~ /LCMSNews/){
print lcms $text;
};
print "WORKING $msgid\n";
# } else {
# print "$msgid\n";
#}
}
close (SCAN) or die "could not scan $outdir - error closing handle";
}
__END__
[Index of Archives]
[Bugtraq]
[Yosemite News]
[Mhonarc Home]