[prev] [thread] [next] [lurker] [Date index for 2005/02/07]
Author: simon
Date: 2005-02-07 18:28:43 +0000 (Mon, 07 Feb 2005)
New Revision: 1835
Added:
trunk/Email-Store-HTML/
trunk/Email-Store-HTML/Build.PL
trunk/Email-Store-HTML/MANIFEST.SKIP
trunk/Email-Store-HTML/lib/
trunk/Email-Store-HTML/lib/Email/
trunk/Email-Store-HTML/lib/Email/Store/
trunk/Email-Store-HTML/lib/Email/Store/HTML.pm
trunk/Email-Store-HTML/svn-commit.tmp
trunk/Email-Store-HTML/t/
trunk/Email-Store-HTML/t/00compile.t
trunk/Email-Store-HTML/t/01basic.t
trunk/Email-Store-HTML/t/htmltest.mail
Log:
Initial import of HTML stripping plugin for Email::Store
Added: trunk/Email-Store-HTML/Build.PL
===================================================================
--- trunk/Email-Store-HTML/Build.PL 2005-02-07 17:08:07 UTC (rev 1834)
+++ trunk/Email-Store-HTML/Build.PL 2005-02-07 18:28:43 UTC (rev 1835)
@@ -0,0 +1,15 @@
+use strict;
+use Module::Build;
+Module::Build->new(
+ module_name => 'Email::Store::HTML',
+ license => 'perl',
+ requires => {
+ 'perl' => 5.006,
+ 'Email::Store' => 0,
+ },
+ build_requires => {
+ 'Test::More' => 0,
+ 'DBD::SQLite' => 0,
+ },
+ create_makefile_pl => 'traditional',
+)->create_build_script;
Added: trunk/Email-Store-HTML/MANIFEST.SKIP
===================================================================
--- trunk/Email-Store-HTML/MANIFEST.SKIP 2005-02-07 17:08:07 UTC (rev 1834)
+++ trunk/Email-Store-HTML/MANIFEST.SKIP 2005-02-07 18:28:43 UTC (rev 1835)
@@ -0,0 +1,20 @@
+CVS/.*
+\.svn/.*
+\.cvsignore$
+\.Inline/.*
+_Inline/.*
+\.bak$
+\.tar$
+\.tgz$
+\.tar\.gz$
+~$
+^mess/
+^tmp/
+^testdata/
+^blib/
+^Makefile$
+^Makefile\.[a-z]+$
+^Build$
+^pm_to_blib$
+^_build/.*
+~$
\ No newline at end of file
Added: trunk/Email-Store-HTML/lib/Email/Store/HTML.pm
===================================================================
--- trunk/Email-Store-HTML/lib/Email/Store/HTML.pm 2005-02-07 17:08:07 UTC (rev 1834)
+++ trunk/Email-Store-HTML/lib/Email/Store/HTML.pm 2005-02-07 18:28:43 UTC (rev 1835)
@@ -0,0 +1,125 @@
+package Email::Store::HTML;
+use base "Email::Store::DBI";
+use strict;
+use Email::Store::Mail;
+__PACKAGE__->table("html_body");
+__PACKAGE__->columns( All => qw[ id mail raw scrubbed as_text ] );
+__PACKAGE__->columns( Primary => qw/id/);
+Email::Store::HTML->has_a(mail => "Email::Store::Mail");
+__PACKAGE__->add_constructor(from_mail => 'mail = ?');
+
+
+
+use HTML::Scrubber;
+use HTML::FormatText::WithLinks;
+use vars qw($VERSION @allow @rules @default);
+
+$VERSION = "0.1";
+
+sub on_store_order { 2 }
+
+sub on_store {
+ my ($self, $mail) = @_;
+
+ # create the text formatter
+ my $f = HTML::FormatText::WithLinks->new(
+ before_link => '',
+ after_link => ' [ %l ]',
+ footnote => ''
+ );
+
+
+ # create the scrubber
+ my $scrubber = HTML::Scrubber->new(
+ allow => \@allow,
+ rules => \@rules,
+ default => \@default,
+ comment => 1,
+ process => 0,
+ );
+
+
+
+
+ for ($mail->attachments) {
+ next unless $_->content_type eq 'text/html';
+ my $raw = $_->payload;
+ my $scrubbed = $scrubber->scrub($raw);
+ my $text = $f->parse($raw);
+ Email::Store::HTML->create( { mail => $mail->id, raw => $raw, scrubbed => $scrubbed, as_text => $text } );
+ last;
+ }
+}
+
+###
+# Configuration for HTML::Scrubber
+###
+
+my @allow = qw[ br hr b a p pre ul ol li i em strong table tr td th div ];
+ #
+my @rules = (
+ script => 0,
+ img => {
+ border => 1,
+ alt => 1, # alt attribute allowed
+ '*' => 0, # deny all other attributes
+ },
+);
+ #
+my @default = (
+ 0 => # default rule, deny all tags
+ {
+ '*' => 1, # default rule, allow all attributes
+ 'href' => qr{^(?!(?:java)?script)}i,
+ 'src' => qr{^(?!(?:java)?script)}i,
+ 'cite' => '(?i-xsm:^(?!(?:java)?script))',
+ 'language' => 0,
+ 'name' => 1, # could be sneaky, but hey ;)
+ 'onblur' => 0,
+ 'onchange' => 0,
+ 'onclick' => 0,
+ 'ondblclick' => 0,
+ 'onerror' => 0,
+ 'onfocus' => 0,
+ 'onkeydown' => 0,
+ 'onkeypress' => 0,
+ 'onkeyup' => 0,
+ 'onload' => 0,
+ 'onmousedown' => 0,
+ 'onmousemove' => 0,
+ 'onmouseout' => 0,
+ 'onmouseover' => 0,
+ 'onmouseup' => 0,
+ 'onreset' => 0,
+ 'onselect' => 0,
+ 'onsubmit' => 0,
+ 'onunload' => 0,
+ 'src' => 0,
+ 'type' => 0,
+ }
+ );
+
+package Email::Store::Mail;
+sub html {
+ my ($self) = @_;
+ my ($html) = Email::Store::HTML->from_mail($self->message_id);
+ return $html;
+}
+
+package Email::Store::HTML;
+1;
+
+__DATA__
+
+CREATE TABLE IF NOT EXISTS html_body (
+ id integer NOT NULL auto_increment primary key,
+ mail varchar(255) NOT NULL,
+ raw text,
+ scrubbed text,
+ as_text text
+);
+
+
+
+
+
Added: trunk/Email-Store-HTML/svn-commit.tmp
===================================================================
--- trunk/Email-Store-HTML/svn-commit.tmp 2005-02-07 17:08:07 UTC (rev 1834)
+++ trunk/Email-Store-HTML/svn-commit.tmp 2005-02-07 18:28:43 UTC (rev 1835)
@@ -0,0 +1,4 @@
+Initial import of HTML stripping stuff
+--This line, and those below, will be ignored--
+
+A .
Added: trunk/Email-Store-HTML/t/00compile.t
===================================================================
--- trunk/Email-Store-HTML/t/00compile.t 2005-02-07 17:08:07 UTC (rev 1834)
+++ trunk/Email-Store-HTML/t/00compile.t 2005-02-07 18:28:43 UTC (rev 1835)
@@ -0,0 +1,3 @@
+# -*- cperl -*-
+use Test::More tests => 1;
+require_ok( "Email::Store::HTML" );
Added: trunk/Email-Store-HTML/t/01basic.t
===================================================================
--- trunk/Email-Store-HTML/t/01basic.t 2005-02-07 17:08:07 UTC (rev 1834)
+++ trunk/Email-Store-HTML/t/01basic.t 2005-02-07 18:28:43 UTC (rev 1835)
@@ -0,0 +1,27 @@
+use Test::More tests => 13;
+use File::Slurp;
+BEGIN { unlink("t/test.db"); }
+use Email::Store "dbi:SQLite:dbname=t/test.db";
+Email::Store->setup( );
+ok(1, "Set up");
+
+my $data = read_file("t/htmltest.mail");
+Email::Store::Mail->store($data);
+my ($m) = Email::Store::Mail->retrieve_all(); #('myfakeid@localhost');
+ok($m, "Got the mail back");
+
+
+my ($html, $body, $raw, $scrubbed, $as_text);
+ok($html = $m->html, "Got html");
+ok($body = $m->simple->body, "Got body");
+ok($raw = $html->raw, "Got raw");
+ok($scrubbed = $html->scrubbed, "Got scrubbed");
+ok($as_text = $html->as_text, "Got text");
+
+unlike($body, qr/</, "No html in body");
+like($raw, qr/</, "Got html in raw");
+like($raw, qr/javascript/, "Got javascript in raw");
+unlike($scrubbed, qr/javascript/, "No html in body");
+unlike($as_text, qr/</, "No html in text");
+like($as_text, qr!this [ http://buscador.thegestalt.org ]!, "Text has link and sentence");
+
Added: trunk/Email-Store-HTML/t/htmltest.mail
===================================================================
--- trunk/Email-Store-HTML/t/htmltest.mail 2005-02-07 17:08:07 UTC (rev 1834)
+++ trunk/Email-Store-HTML/t/htmltest.mail 2005-02-07 18:28:43 UTC (rev 1835)
@@ -0,0 +1,36 @@
+Content-Transfer-Encoding: binary
+Content-Type: multipart/mixed; boundary="_----------=_110780049136980"
+MIME-Version: 1.0
+X-Mailer: MIME::Lite 3.01 (F2.71; T1.13; A1.58; B3.05; Q3.03)
+Date: Mon, 7 Feb 2005 18:21:31 UT
+From: someone@xxxxxxx.xxx
+To: simon@xxxxxxx.xxx
+Subject: Mixed part message
+
+This is a multi-part message in MIME format.
+
+--_----------=_110780049136980
+Content-Disposition: inline
+Content-Length: 65
+Content-Transfer-Encoding: binary
+Content-Type: text/plain
+
+This (http://buscador.thegestalt.org) should come out in the text
+--_----------=_110780049136980
+Content-Disposition: attachment; filename="msg.html"
+Content-Length: 202
+Content-Transfer-Encoding: binary
+Content-Type: text/html; name="msg.html"
+
+<html>
+<body>
+<div>
+<font face="Arial">
+<a href="http://buscador.thegestalt.org" onClick="javascript:alert('Danger Will Robinson!')">this </a> should come out in the text
+</font>
+</div>
+</body>
+</html>
+
+--_----------=_110780049136980--
+
Generated at 19:00 on 07 Feb 2005 by mariachi 0.52