#!/usr/local/bin/perl
#
# This is a filter which attempts to clean up MIME messages with HTML
# and binary attachments.  It strips the HTML and binary sections and
# flattens all multipart sections into one text/plain section.  We
# keep the MIME encoding type for the plain section.
#
# We also wrap ASCII-looking text with long lines to make it easier to
# reply to messages
#
# Note:  This script probably won't work properly for email with a lot
# of international characters.  It has only been tested on mailing
# lists with english content.
#

my $szLine = "";
my $fInHeaders = 1;
my $fMime = 0;

my %rgJunkHeaders = (
	"content-type"				=> 1,
	"content-transfer-encoding"	=> 1,
);

my $szHeaders = "";
my $szRemoved = "";
my $fForward = 0;

#
# all of the work is done in this function
#
# arguments:
#   boundary for this level.  if "" then we are at the top level
# 
sub StripMime {
	my $szBoundary = shift;
	my $szContentType = shift;
	my $szRemoveDepth = shift;
	my $fTopLevel = ($szBoundary eq "" ? 1 : 0);
	my $szNewBoundary = "";
	my $szContentTransferEncodingHeader = "";
	my $szContentTypeHeader = "";
	my $fQuotedPrintable = 0;
	my $cConsecQuotedLines = 0;
	
	while (<>) {
		chomp;
		my $szThisLine = $_;

		if (!$fTopLevel && $szThisLine =~ /^--\Q$szBoundary\E($|--$)/) {
			#
			# we hit a boundary in a multipart section
			#
			if ($1 eq "--") {
				#
				# this is the end of the multipart section, return
				# to our caller
				#
				return;
			}
			# default to plaintext in all sections unless told otherwise
			$szContentType = "text/plain";
			# we are in headers again
			$fInHeaders = 1;
		} elsif ($fInHeaders && $szThisLine eq "") { 
			#
			# we hit the end of the headers.  
			#
			$fInHeaders = 0;

			# if there was a multipart section with a new boundary marker
			# then we need to recurse into it and clean it up
			if ($szNewBoundary ne "") {
				$szRemoved .= "$szRemoveDepth$szContentType\n";
				if ($fTopLevel) {
					$szHeaders = $szHeaders . "X-StripMime: Non-text section removed by stripmime\n";
				}
				StripMime($szNewBoundary, $szContentType, $szRemoveDepth . "  ");
				$szNewBoundary = "";
			} else {
				if ($szContentType eq "text/plain" && $szHeaders ne "") {
					print $szHeaders;
					print $szContentTransferEncodingHeader;
					print $szContentTypeHeader;
					print "\n";
					$szHeaders = "";
				}
				if ($szContentType ne "text/plain" || $szRemoved ne "") {
					if ($szContentType eq "text/plain") {
						$szRemoved .= "$szRemoveDepth$szContentType (text body -- kept)\n";
					} else {
						$szRemoved .= "$szRemoveDepth$szContentType\n";
					}
				}
			}
		} elsif ($fInHeaders) {
			#
			# we are processing headers
			#

			my $szHeaderName = "";

			# check for a header continuation
			if ($szThisLine =~ /^\s(.*)$/) {
				$szLine .= $1;
			} else {
				$szLine = $szThisLine;
			}

			# get the name of this header
			if ($szLine =~ /^([\w-]+):/) { 
				$szHeaderName = lc($1); 
			}

			# get the content type
			if ($szLine =~ /^Content-type:\s*([^;]*);/i) { 
				$szContentType = lc($1); 
				$szContentTypeHeader = $szLine . "\n";
			}

			# see if this message is forwarded
			if ($szThisLine =~ /^subject:(\s+fw:|.*\(fwd\)\s*$)/i) {
				$fForward = 1;
			}

			# if the content type is multipart then get the boundary code
			if ($szLine =~ /^Content-type:\s*multipart\/.*boundary=(\"([^\"]+)\"|([\w+\'\(\)\+,\-.\/:=?]+))/i) {
				$szNewBoundary = ($2 eq "" ? $3 : $2);
			}

			# get the content transfer encoding.  if it is quoted-printable
			# then we will clean it up a bit when working on the body.
			if ($szLine =~ /^Content-transfer-encoding:\s+(.*)$/i) {
				$szContentTransferEncodingHeader = $szLine . "\n";
				if ($1 =~ /quoted-printable/i) {
					$fQuotedPrintable = 0;
				}
			}

			# print this header if it is at the top-level and not one
			# of our junk headers
			if ($fTopLevel && !(exists $rgJunkHeaders{$szHeaderName})) {
				$szHeaders = $szHeaders . "$szThisLine\n";
			}
		} elsif ($szContentType eq "text/plain") {
			# check to see if this is quoted data.  we don't allow blocks
			# of quoted data greater then 50 lines
			if ($szThisLine =~ /^\>/ && !($fForward)) {
				$cConsecQuotedLines++;
			}
#			if ($cConsecQuotedLines == 50) {
#				$szThisLine = "[ *** too many quoted lines.  automatically truncated *** ]\n";
#			} 

			# go through and work on the normal body
#			if ($cConsecQuotedLines <= 50) {
				# if the line ends with = then strip it
				if ($fQuotedPrintable) {
					# remove trailing =
					$szThisLine =~ s/=$//;

					# if a normal character is quoted then unquote it
					# we need to process left to right (instead of
					# reprocessing what we've processed) so that we
					# don't convert something "=3D20" to " " (it
					# should be "=20"
					my $szRemainder = $szThisLine;
					$szThisLine = "";

					while ($szRemainder =~ /=([0-9A-F]{2})/) {
						my $i = hex $1;
						my $ch = "=$1";
						if ($i >= 32 && $i <= 127) {
							$ch = chr($i);
						} 
						$szThisLine .= "$`$ch";
						$szRemainder = $';
					}
					$szThisLine .= $szRemainder;
				}

#				if (length($szThisLine) < 77) {
					print "$szThisLine\n";
#				} else {
#					# wrap long lines using this format
#format =
#^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
#$szThisLine
#.
#					# figure out the quote characters at the front of
#					# the line.  if they exist then we will put them in
#					# front of wrapped lines to make the quoting look
#					# right
#					$szThisLine =~ /([>\s]+)/;
#					do {
#						write;
#						$szThisLine = $1 . $szThisLine;
#					} while ($szThisLine ne $1);
#				}
#			}
		} 
	}

	if ($fTopLevel) {
		if ($szHeaders ne "" && $szContentType ne "text/plain") {
			# if there was no text/plain part then we can end up here.
			# not much that we can do, so we add a special header saying
			# that the content was HTML only and then put a helpful blurb
			# in the body
			print $szHeaders;
			print "X-StripMime-Failure: no text/plain\n";
			print "\n";
			print "--- StripMime Report -- processed MIME parts ---\n";
			print "$szRemoved";
			print "--- StripMime Errors ---\n";
			print "A message with no text/plain section was received.\n";
			print "The entire body of the message was removed.  Please\n";
			print "resend the email using plaintext formatting\n";
			print "---\n";
		} elsif ($szRemoved ne "") {
			# if we removed some stuff then let the world know
			print "--- StripMime Report -- processed MIME parts ---\n";
			print "$szRemoved";
			print "---\n";
		}
	}
}

# start things going at the top level
StripMime("", "text/plain", "");
