#!/usr/local/bin/perl # # This is a filter which attempts to clean up MIME messages with HTML # and binary attachments. It strips the HTML and binary sections and # flattens all multipart sections into one text/plain section. We # keep the MIME encoding type for the plain section. # # We also wrap ASCII-looking text with long lines to make it easier to # reply to messages # # Note: This script probably won't work properly for email with a lot # of international characters. It has only been tested on mailing # lists with english content. # my $szLine = ""; my $fInHeaders = 1; my $fMime = 0; my %rgJunkHeaders = ( "content-type" => 1, "content-transfer-encoding" => 1, ); my $szHeaders = ""; my $szRemoved = ""; my $fForward = 0; # # all of the work is done in this function # # arguments: # boundary for this level. if "" then we are at the top level # sub StripMime { my $szBoundary = shift; my $szContentType = shift; my $szRemoveDepth = shift; my $fTopLevel = ($szBoundary eq "" ? 1 : 0); my $szNewBoundary = ""; my $szContentTransferEncodingHeader = ""; my $szContentTypeHeader = ""; my $fQuotedPrintable = 0; my $cConsecQuotedLines = 0; while (<>) { chomp; my $szThisLine = $_; if (!$fTopLevel && $szThisLine =~ /^--\Q$szBoundary\E($|--$)/) { # # we hit a boundary in a multipart section # if ($1 eq "--") { # # this is the end of the multipart section, return # to our caller # return; } # default to plaintext in all sections unless told otherwise $szContentType = "text/plain"; # we are in headers again $fInHeaders = 1; } elsif ($fInHeaders && $szThisLine eq "") { # # we hit the end of the headers. # $fInHeaders = 0; # if there was a multipart section with a new boundary marker # then we need to recurse into it and clean it up if ($szNewBoundary ne "") { $szRemoved .= "$szRemoveDepth$szContentType\n"; if ($fTopLevel) { $szHeaders = $szHeaders . "X-StripMime: Non-text section removed by stripmime\n"; } StripMime($szNewBoundary, $szContentType, $szRemoveDepth . " "); $szNewBoundary = ""; } else { if ($szContentType eq "text/plain" && $szHeaders ne "") { print $szHeaders; print $szContentTransferEncodingHeader; print $szContentTypeHeader; print "\n"; $szHeaders = ""; } if ($szContentType ne "text/plain" || $szRemoved ne "") { if ($szContentType eq "text/plain") { $szRemoved .= "$szRemoveDepth$szContentType (text body -- kept)\n"; } else { $szRemoved .= "$szRemoveDepth$szContentType\n"; } } } } elsif ($fInHeaders) { # # we are processing headers # my $szHeaderName = ""; # check for a header continuation if ($szThisLine =~ /^\s(.*)$/) { $szLine .= $1; } else { $szLine = $szThisLine; } # get the name of this header if ($szLine =~ /^([\w-]+):/) { $szHeaderName = lc($1); } # get the content type if ($szLine =~ /^Content-type:\s*([^;]*);/i) { $szContentType = lc($1); $szContentTypeHeader = $szLine . "\n"; } # see if this message is forwarded if ($szThisLine =~ /^subject:(\s+fw:|.*\(fwd\)\s*$)/i) { $fForward = 1; } # if the content type is multipart then get the boundary code if ($szLine =~ /^Content-type:\s*multipart\/.*boundary=(\"([^\"]+)\"|([\w+\'\(\)\+,\-.\/:=?]+))/i) { $szNewBoundary = ($2 eq "" ? $3 : $2); } # get the content transfer encoding. if it is quoted-printable # then we will clean it up a bit when working on the body. if ($szLine =~ /^Content-transfer-encoding:\s+(.*)$/i) { $szContentTransferEncodingHeader = $szLine . "\n"; if ($1 =~ /quoted-printable/i) { $fQuotedPrintable = 0; } } # print this header if it is at the top-level and not one # of our junk headers if ($fTopLevel && !(exists $rgJunkHeaders{$szHeaderName})) { $szHeaders = $szHeaders . "$szThisLine\n"; } } elsif ($szContentType eq "text/plain") { # check to see if this is quoted data. we don't allow blocks # of quoted data greater then 50 lines if ($szThisLine =~ /^\>/ && !($fForward)) { $cConsecQuotedLines++; } # if ($cConsecQuotedLines == 50) { # $szThisLine = "[ *** too many quoted lines. automatically truncated *** ]\n"; # } # go through and work on the normal body # if ($cConsecQuotedLines <= 50) { # if the line ends with = then strip it if ($fQuotedPrintable) { # remove trailing = $szThisLine =~ s/=$//; # if a normal character is quoted then unquote it # we need to process left to right (instead of # reprocessing what we've processed) so that we # don't convert something "=3D20" to " " (it # should be "=20" my $szRemainder = $szThisLine; $szThisLine = ""; while ($szRemainder =~ /=([0-9A-F]{2})/) { my $i = hex $1; my $ch = "=$1"; if ($i >= 32 && $i <= 127) { $ch = chr($i); } $szThisLine .= "$`$ch"; $szRemainder = $'; } $szThisLine .= $szRemainder; } # if (length($szThisLine) < 77) { print "$szThisLine\n"; # } else { # # wrap long lines using this format #format = #^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #$szThisLine #. # # figure out the quote characters at the front of # # the line. if they exist then we will put them in # # front of wrapped lines to make the quoting look # # right # $szThisLine =~ /([>\s]+)/; # do { # write; # $szThisLine = $1 . $szThisLine; # } while ($szThisLine ne $1); # } # } } } if ($fTopLevel) { if ($szHeaders ne "" && $szContentType ne "text/plain") { # if there was no text/plain part then we can end up here. # not much that we can do, so we add a special header saying # that the content was HTML only and then put a helpful blurb # in the body print $szHeaders; print "X-StripMime-Failure: no text/plain\n"; print "\n"; print "--- StripMime Report -- processed MIME parts ---\n"; print "$szRemoved"; print "--- StripMime Errors ---\n"; print "A message with no text/plain section was received.\n"; print "The entire body of the message was removed. Please\n"; print "resend the email using plaintext formatting\n"; print "---\n"; } elsif ($szRemoved ne "") { # if we removed some stuff then let the world know print "--- StripMime Report -- processed MIME parts ---\n"; print "$szRemoved"; print "---\n"; } } } # start things going at the top level StripMime("", "text/plain", "");