[prev] [thread] [next] [lurker] [Date index for 2005/02/07]
Author: simon Date: 2005-02-07 18:28:43 +0000 (Mon, 07 Feb 2005) New Revision: 1835 Added: trunk/Email-Store-HTML/ trunk/Email-Store-HTML/Build.PL trunk/Email-Store-HTML/MANIFEST.SKIP trunk/Email-Store-HTML/lib/ trunk/Email-Store-HTML/lib/Email/ trunk/Email-Store-HTML/lib/Email/Store/ trunk/Email-Store-HTML/lib/Email/Store/HTML.pm trunk/Email-Store-HTML/svn-commit.tmp trunk/Email-Store-HTML/t/ trunk/Email-Store-HTML/t/00compile.t trunk/Email-Store-HTML/t/01basic.t trunk/Email-Store-HTML/t/htmltest.mail Log: Initial import of HTML stripping plugin for Email::Store Added: trunk/Email-Store-HTML/Build.PL =================================================================== --- trunk/Email-Store-HTML/Build.PL 2005-02-07 17:08:07 UTC (rev 1834) +++ trunk/Email-Store-HTML/Build.PL 2005-02-07 18:28:43 UTC (rev 1835) @@ -0,0 +1,15 @@ +use strict; +use Module::Build; +Module::Build->new( + module_name => 'Email::Store::HTML', + license => 'perl', + requires => { + 'perl' => 5.006, + 'Email::Store' => 0, + }, + build_requires => { + 'Test::More' => 0, + 'DBD::SQLite' => 0, + }, + create_makefile_pl => 'traditional', +)->create_build_script; Added: trunk/Email-Store-HTML/MANIFEST.SKIP =================================================================== --- trunk/Email-Store-HTML/MANIFEST.SKIP 2005-02-07 17:08:07 UTC (rev 1834) +++ trunk/Email-Store-HTML/MANIFEST.SKIP 2005-02-07 18:28:43 UTC (rev 1835) @@ -0,0 +1,20 @@ +CVS/.* +\.svn/.* +\.cvsignore$ +\.Inline/.* +_Inline/.* +\.bak$ +\.tar$ +\.tgz$ +\.tar\.gz$ +~$ +^mess/ +^tmp/ +^testdata/ +^blib/ +^Makefile$ +^Makefile\.[a-z]+$ +^Build$ +^pm_to_blib$ +^_build/.* +~$ \ No newline at end of file Added: trunk/Email-Store-HTML/lib/Email/Store/HTML.pm =================================================================== --- trunk/Email-Store-HTML/lib/Email/Store/HTML.pm 2005-02-07 17:08:07 UTC (rev 1834) +++ trunk/Email-Store-HTML/lib/Email/Store/HTML.pm 2005-02-07 18:28:43 UTC (rev 1835) @@ -0,0 +1,125 @@ +package Email::Store::HTML; +use base "Email::Store::DBI"; +use strict; +use Email::Store::Mail; +__PACKAGE__->table("html_body"); +__PACKAGE__->columns( All => qw[ id mail raw scrubbed as_text ] ); +__PACKAGE__->columns( Primary => qw/id/); +Email::Store::HTML->has_a(mail => "Email::Store::Mail"); +__PACKAGE__->add_constructor(from_mail => 'mail = ?'); + + + +use HTML::Scrubber; +use HTML::FormatText::WithLinks; +use vars qw($VERSION @allow @rules @default); + +$VERSION = "0.1"; + +sub on_store_order { 2 } + +sub on_store { + my ($self, $mail) = @_; + + # create the text formatter + my $f = HTML::FormatText::WithLinks->new( + before_link => '', + after_link => ' [ %l ]', + footnote => '' + ); + + + # create the scrubber + my $scrubber = HTML::Scrubber->new( + allow => \@allow, + rules => \@rules, + default => \@default, + comment => 1, + process => 0, + ); + + + + + for ($mail->attachments) { + next unless $_->content_type eq 'text/html'; + my $raw = $_->payload; + my $scrubbed = $scrubber->scrub($raw); + my $text = $f->parse($raw); + Email::Store::HTML->create( { mail => $mail->id, raw => $raw, scrubbed => $scrubbed, as_text => $text } ); + last; + } +} + +### +# Configuration for HTML::Scrubber +### + +my @allow = qw[ br hr b a p pre ul ol li i em strong table tr td th div ]; + # +my @rules = ( + script => 0, + img => { + border => 1, + alt => 1, # alt attribute allowed + '*' => 0, # deny all other attributes + }, +); + # +my @default = ( + 0 => # default rule, deny all tags + { + '*' => 1, # default rule, allow all attributes + 'href' => qr{^(?!(?:java)?script)}i, + 'src' => qr{^(?!(?:java)?script)}i, + 'cite' => '(?i-xsm:^(?!(?:java)?script))', + 'language' => 0, + 'name' => 1, # could be sneaky, but hey ;) + 'onblur' => 0, + 'onchange' => 0, + 'onclick' => 0, + 'ondblclick' => 0, + 'onerror' => 0, + 'onfocus' => 0, + 'onkeydown' => 0, + 'onkeypress' => 0, + 'onkeyup' => 0, + 'onload' => 0, + 'onmousedown' => 0, + 'onmousemove' => 0, + 'onmouseout' => 0, + 'onmouseover' => 0, + 'onmouseup' => 0, + 'onreset' => 0, + 'onselect' => 0, + 'onsubmit' => 0, + 'onunload' => 0, + 'src' => 0, + 'type' => 0, + } + ); + +package Email::Store::Mail; +sub html { + my ($self) = @_; + my ($html) = Email::Store::HTML->from_mail($self->message_id); + return $html; +} + +package Email::Store::HTML; +1; + +__DATA__ + +CREATE TABLE IF NOT EXISTS html_body ( + id integer NOT NULL auto_increment primary key, + mail varchar(255) NOT NULL, + raw text, + scrubbed text, + as_text text +); + + + + + Added: trunk/Email-Store-HTML/svn-commit.tmp =================================================================== --- trunk/Email-Store-HTML/svn-commit.tmp 2005-02-07 17:08:07 UTC (rev 1834) +++ trunk/Email-Store-HTML/svn-commit.tmp 2005-02-07 18:28:43 UTC (rev 1835) @@ -0,0 +1,4 @@ +Initial import of HTML stripping stuff +--This line, and those below, will be ignored-- + +A . Added: trunk/Email-Store-HTML/t/00compile.t =================================================================== --- trunk/Email-Store-HTML/t/00compile.t 2005-02-07 17:08:07 UTC (rev 1834) +++ trunk/Email-Store-HTML/t/00compile.t 2005-02-07 18:28:43 UTC (rev 1835) @@ -0,0 +1,3 @@ +# -*- cperl -*- +use Test::More tests => 1; +require_ok( "Email::Store::HTML" ); Added: trunk/Email-Store-HTML/t/01basic.t =================================================================== --- trunk/Email-Store-HTML/t/01basic.t 2005-02-07 17:08:07 UTC (rev 1834) +++ trunk/Email-Store-HTML/t/01basic.t 2005-02-07 18:28:43 UTC (rev 1835) @@ -0,0 +1,27 @@ +use Test::More tests => 13; +use File::Slurp; +BEGIN { unlink("t/test.db"); } +use Email::Store "dbi:SQLite:dbname=t/test.db"; +Email::Store->setup( ); +ok(1, "Set up"); + +my $data = read_file("t/htmltest.mail"); +Email::Store::Mail->store($data); +my ($m) = Email::Store::Mail->retrieve_all(); #('myfakeid@localhost'); +ok($m, "Got the mail back"); + + +my ($html, $body, $raw, $scrubbed, $as_text); +ok($html = $m->html, "Got html"); +ok($body = $m->simple->body, "Got body"); +ok($raw = $html->raw, "Got raw"); +ok($scrubbed = $html->scrubbed, "Got scrubbed"); +ok($as_text = $html->as_text, "Got text"); + +unlike($body, qr/</, "No html in body"); +like($raw, qr/</, "Got html in raw"); +like($raw, qr/javascript/, "Got javascript in raw"); +unlike($scrubbed, qr/javascript/, "No html in body"); +unlike($as_text, qr/</, "No html in text"); +like($as_text, qr!this [ http://buscador.thegestalt.org ]!, "Text has link and sentence"); + Added: trunk/Email-Store-HTML/t/htmltest.mail =================================================================== --- trunk/Email-Store-HTML/t/htmltest.mail 2005-02-07 17:08:07 UTC (rev 1834) +++ trunk/Email-Store-HTML/t/htmltest.mail 2005-02-07 18:28:43 UTC (rev 1835) @@ -0,0 +1,36 @@ +Content-Transfer-Encoding: binary +Content-Type: multipart/mixed; boundary="_----------=_110780049136980" +MIME-Version: 1.0 +X-Mailer: MIME::Lite 3.01 (F2.71; T1.13; A1.58; B3.05; Q3.03) +Date: Mon, 7 Feb 2005 18:21:31 UT +From: someone@xxxxxxx.xxx +To: simon@xxxxxxx.xxx +Subject: Mixed part message + +This is a multi-part message in MIME format. + +--_----------=_110780049136980 +Content-Disposition: inline +Content-Length: 65 +Content-Transfer-Encoding: binary +Content-Type: text/plain + +This (http://buscador.thegestalt.org) should come out in the text +--_----------=_110780049136980 +Content-Disposition: attachment; filename="msg.html" +Content-Length: 202 +Content-Transfer-Encoding: binary +Content-Type: text/html; name="msg.html" + +<html> +<body> +<div> +<font face="Arial"> +<a href="http://buscador.thegestalt.org" onClick="javascript:alert('Danger Will Robinson!')">this </a> should come out in the text +</font> +</div> +</body> +</html> + +--_----------=_110780049136980-- +
Generated at 19:00 on 07 Feb 2005 by mariachi 0.52