rev 1835 - in trunk: . Email-Store-HTML Email-Store-HTML/lib Email-Store-HTML/lib/Email Email-Store-HTML/lib/Email/Store Email-Store-HTML/t

[prev] [thread] [next] [lurker] [Date index for 2005/02/07]

From: simon
Subject: rev 1835 - in trunk: . Email-Store-HTML Email-Store-HTML/lib Email-Store-HTML/lib/Email Email-Store-HTML/lib/Email/Store Email-Store-HTML/t
Date: 18:28 on 07 Feb 2005
Author: simon
Date: 2005-02-07 18:28:43 +0000 (Mon, 07 Feb 2005)
New Revision: 1835

Added:
   trunk/Email-Store-HTML/
   trunk/Email-Store-HTML/Build.PL
   trunk/Email-Store-HTML/MANIFEST.SKIP
   trunk/Email-Store-HTML/lib/
   trunk/Email-Store-HTML/lib/Email/
   trunk/Email-Store-HTML/lib/Email/Store/
   trunk/Email-Store-HTML/lib/Email/Store/HTML.pm
   trunk/Email-Store-HTML/svn-commit.tmp
   trunk/Email-Store-HTML/t/
   trunk/Email-Store-HTML/t/00compile.t
   trunk/Email-Store-HTML/t/01basic.t
   trunk/Email-Store-HTML/t/htmltest.mail
Log:
Initial import of HTML stripping plugin for Email::Store


Added: trunk/Email-Store-HTML/Build.PL
===================================================================
--- trunk/Email-Store-HTML/Build.PL	2005-02-07 17:08:07 UTC (rev 1834)
+++ trunk/Email-Store-HTML/Build.PL	2005-02-07 18:28:43 UTC (rev 1835)
@@ -0,0 +1,15 @@
+use strict;
+use Module::Build;
+Module::Build->new(
+    module_name => 'Email::Store::HTML',
+    license     => 'perl',
+    requires    =>    {
+                         'perl' => 5.006,
+                         'Email::Store' => 0,
+                      },
+    build_requires => {
+                         'Test::More' => 0,
+                         'DBD::SQLite' => 0,
+                      },
+    create_makefile_pl => 'traditional',
+)->create_build_script;

Added: trunk/Email-Store-HTML/MANIFEST.SKIP
===================================================================
--- trunk/Email-Store-HTML/MANIFEST.SKIP	2005-02-07 17:08:07 UTC (rev 1834)
+++ trunk/Email-Store-HTML/MANIFEST.SKIP	2005-02-07 18:28:43 UTC (rev 1835)
@@ -0,0 +1,20 @@
+CVS/.*
+\.svn/.*
+\.cvsignore$
+\.Inline/.*
+_Inline/.*
+\.bak$
+\.tar$
+\.tgz$
+\.tar\.gz$
+~$
+^mess/
+^tmp/
+^testdata/
+^blib/
+^Makefile$
+^Makefile\.[a-z]+$
+^Build$
+^pm_to_blib$
+^_build/.*
+~$
\ No newline at end of file

Added: trunk/Email-Store-HTML/lib/Email/Store/HTML.pm
===================================================================
--- trunk/Email-Store-HTML/lib/Email/Store/HTML.pm	2005-02-07 17:08:07 UTC (rev 1834)
+++ trunk/Email-Store-HTML/lib/Email/Store/HTML.pm	2005-02-07 18:28:43 UTC (rev 1835)
@@ -0,0 +1,125 @@
+package Email::Store::HTML;
+use base "Email::Store::DBI";
+use strict;
+use Email::Store::Mail;
+__PACKAGE__->table("html_body");
+__PACKAGE__->columns( All => qw[ id mail raw scrubbed as_text  ] );
+__PACKAGE__->columns( Primary => qw/id/);
+Email::Store::HTML->has_a(mail => "Email::Store::Mail");
+__PACKAGE__->add_constructor(from_mail => 'mail = ?');                      
+                                                                                
+
+
+use HTML::Scrubber;
+use HTML::FormatText::WithLinks;        
+use vars qw($VERSION @allow @rules @default);
+
+$VERSION = "0.1";
+
+sub on_store_order { 2 }
+
+sub on_store {
+    my ($self, $mail) = @_;
+
+    # create the text formatter    
+    my $f = HTML::FormatText::WithLinks->new( 
+        before_link => '',
+        after_link  => ' [ %l ]',
+        footnote    => ''
+    );
+
+
+    # create the scrubber
+    my $scrubber = HTML::Scrubber->new(
+        allow   => \@allow,
+        rules   => \@rules,
+        default => \@default,
+        comment => 1,
+        process => 0,
+    );
+
+
+
+
+    for ($mail->attachments) {
+        next unless $_->content_type eq 'text/html';
+        my $raw      =  $_->payload;
+        my $scrubbed =  $scrubber->scrub($raw);
+        my $text     =  $f->parse($raw); 
+        Email::Store::HTML->create( { mail => $mail->id, raw => $raw, scrubbed => $scrubbed, as_text => $text } );
+        last;
+    }
+}
+
+###
+# Configuration for HTML::Scrubber
+###
+
+my @allow = qw[ br hr b a p pre ul ol li i em strong table tr td th div ];
+                                                                            #
+my @rules = (
+        script => 0,
+        img => {
+            border => 1,
+            alt => 1,                 # alt attribute allowed
+            '*' => 0,                 # deny all other attributes
+        },
+);
+                                                                            #
+my @default = (
+        0   =>    # default rule, deny all tags
+        {
+            '*'           => 1, # default rule, allow all attributes
+            'href'        => qr{^(?!(?:java)?script)}i,
+            'src'         => qr{^(?!(?:java)?script)}i,
+            'cite'        => '(?i-xsm:^(?!(?:java)?script))',
+            'language'    => 0,
+            'name'        => 1, # could be sneaky, but hey ;)
+            'onblur'      => 0,
+            'onchange'    => 0,
+            'onclick'     => 0,
+            'ondblclick'  => 0,
+            'onerror'     => 0,
+            'onfocus'     => 0,
+            'onkeydown'   => 0,
+            'onkeypress'  => 0,
+            'onkeyup'     => 0,
+            'onload'      => 0,
+            'onmousedown' => 0,
+            'onmousemove' => 0,
+            'onmouseout'  => 0,
+            'onmouseover' => 0,
+            'onmouseup'   => 0,
+            'onreset'     => 0,
+            'onselect'    => 0,
+            'onsubmit'    => 0,
+            'onunload'    => 0,
+            'src'         => 0,
+            'type'        => 0,
+        }
+    );
+
+package Email::Store::Mail;                                                                    
+sub html {                                                                       
+  my ($self) = @_;                                                              
+  my ($html) = Email::Store::HTML->from_mail($self->message_id);                                           
+  return $html;
+}       
+
+package Email::Store::HTML;
+1;
+
+__DATA__
+
+CREATE TABLE IF NOT EXISTS html_body (
+    id integer NOT NULL auto_increment primary key,
+    mail         varchar(255) NOT NULL,
+    raw          text,
+    scrubbed     text,
+    as_text      text
+);
+
+
+
+
+

Added: trunk/Email-Store-HTML/svn-commit.tmp
===================================================================
--- trunk/Email-Store-HTML/svn-commit.tmp	2005-02-07 17:08:07 UTC (rev 1834)
+++ trunk/Email-Store-HTML/svn-commit.tmp	2005-02-07 18:28:43 UTC (rev 1835)
@@ -0,0 +1,4 @@
+Initial import of HTML stripping stuff
+--This line, and those below, will be ignored--
+
+A    .

Added: trunk/Email-Store-HTML/t/00compile.t
===================================================================
--- trunk/Email-Store-HTML/t/00compile.t	2005-02-07 17:08:07 UTC (rev 1834)
+++ trunk/Email-Store-HTML/t/00compile.t	2005-02-07 18:28:43 UTC (rev 1835)
@@ -0,0 +1,3 @@
+# -*- cperl -*-
+use Test::More tests => 1;
+require_ok( "Email::Store::HTML" );

Added: trunk/Email-Store-HTML/t/01basic.t
===================================================================
--- trunk/Email-Store-HTML/t/01basic.t	2005-02-07 17:08:07 UTC (rev 1834)
+++ trunk/Email-Store-HTML/t/01basic.t	2005-02-07 18:28:43 UTC (rev 1835)
@@ -0,0 +1,27 @@
+use Test::More tests => 13;
+use File::Slurp;
+BEGIN { unlink("t/test.db"); }
+use Email::Store "dbi:SQLite:dbname=t/test.db";
+Email::Store->setup( );
+ok(1, "Set up");
+
+my $data = read_file("t/htmltest.mail");
+Email::Store::Mail->store($data);
+my ($m) = Email::Store::Mail->retrieve_all(); #('myfakeid@localhost');
+ok($m, "Got the mail back");
+
+
+my ($html, $body, $raw, $scrubbed, $as_text);
+ok($html     = $m->html,      "Got html");
+ok($body     = $m->simple->body,   "Got body");
+ok($raw      = $html->raw,      "Got raw");
+ok($scrubbed = $html->scrubbed, "Got scrubbed");
+ok($as_text  = $html->as_text,  "Got text");
+
+unlike($body,     qr/</, "No html in body");
+like($raw,        qr/</, "Got html in raw");
+like($raw,        qr/javascript/, "Got javascript in raw");
+unlike($scrubbed, qr/javascript/, "No html in body");
+unlike($as_text,  qr/</, "No html in text");
+like($as_text,    qr!this [ http://buscador.thegestalt.org ]!, "Text has link and sentence");
+

Added: trunk/Email-Store-HTML/t/htmltest.mail
===================================================================
--- trunk/Email-Store-HTML/t/htmltest.mail	2005-02-07 17:08:07 UTC (rev 1834)
+++ trunk/Email-Store-HTML/t/htmltest.mail	2005-02-07 18:28:43 UTC (rev 1835)
@@ -0,0 +1,36 @@
+Content-Transfer-Encoding: binary
+Content-Type: multipart/mixed; boundary="_----------=_110780049136980"
+MIME-Version: 1.0
+X-Mailer: MIME::Lite 3.01 (F2.71; T1.13; A1.58; B3.05; Q3.03)
+Date: Mon, 7 Feb 2005 18:21:31 UT
+From: someone@xxxxxxx.xxx
+To: simon@xxxxxxx.xxx
+Subject: Mixed part message
+
+This is a multi-part message in MIME format.
+
+--_----------=_110780049136980
+Content-Disposition: inline
+Content-Length: 65
+Content-Transfer-Encoding: binary
+Content-Type: text/plain
+
+This (http://buscador.thegestalt.org) should come out in the text
+--_----------=_110780049136980
+Content-Disposition: attachment; filename="msg.html"
+Content-Length: 202
+Content-Transfer-Encoding: binary
+Content-Type: text/html; name="msg.html"
+
+<html>
+<body>
+<div>
+<font face="Arial">
+<a href="http://buscador.thegestalt.org"; onClick="javascript:alert('Danger Will Robinson!')">this </a> should come out in the text
+</font>
+</div>
+</body>
+</html>
+
+--_----------=_110780049136980--
+

Generated at 19:00 on 07 Feb 2005 by mariachi 0.52