#!/usr/bin/perl
#
# This is a filter which attempts to clean up MIME messages with HTML
# and binary attachments.  It strips the HTML and binary sections and
# flattens all multipart sections into one text/plain section.  We
# keep the MIME encoding type for the plain section.
#
# Note:  This script probably won't work properly for email with a lot
# of international characters.  It has only been tested on mailing
# lists with english content.
#
# More information and the latest version can be found at
# http://www.phred.org/~alex/stripmime.html
#
# Change History:  
# 1.0 - first public version 
# 1.1 - HTML conversion - Mar 11 2001
#
# This code is Copyright 2000-2001 Alex Wetmore. 
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met: 
#
# 1.  Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer. 
#
# 2.  Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided with
# the distribution. 
#
# 3.  All advertising materials mentioning features or use of this
# software must display the following acknowledgement:  This product
# includes software developed by Alex Wetmore. 
#
# 4.  The name of Alex Wetmore may not be used to endorse or promote
# products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS''
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#

my $szLine = "";
my $fInHeaders = 1;
my $fMime = 0;

my %rgJunkHeaders = (
	"content-type"				=> 1,
	"content-transfer-encoding"	=> 1,
);

my $szHeaders = "";
my $szRemoved = "";
my $fForward = 0;

# This is not at all complete.  I just did the easy ones.
my %hHTMLCharConversion = (
	"nbsp"		=> " ",
	"gt"		=> ">",
	"lt"		=> "<",
	"amp"		=> "&",
	"and"		=> "&&",
	"asymp"		=> "=~",
	"brvbar"	=> "|",
	"bull"		=> "*",
	"cong"		=> "=~",
	"copy"		=> "(c)",
	"crarr"		=> "<cr>",
	"equiv"		=> "==",
	"ge"		=> "<=",
	"lang"		=> "<",
	"lsquo"		=> "\'",
	"mdash"		=> "--",
	"minus"		=> "-",
	"reg"		=> "(tm)",
	"sim"		=> "~",
	"thinsp"	=> " ",
	"shy"		=> "-",
	"trade"		=> "(tm)",
	"times"		=> "*",
);

# here I am mostly picking ones which have a large effect on layout
my %hHTMLTagConversion = (
	"/div"		=> "\n",
	"/p"		=> "\n",
	"hr"		=> "\n------------------------------------------\n",
	"br"		=> "\n",
	"li"		=> " * ",
);

#
# all of the work is done in this function
#
# arguments:
#   boundary for this level.  if "" then we are at the top level
# 
sub StripMime {
	my $szBoundary = shift;
	my $szContentType = shift;
	my $szRemoveDepth = shift;
	my $fTopLevel = ($szBoundary eq "" ? 1 : 0);
	my $szNewBoundary = "";
	my $szContentTransferEncodingHeader = "";
	my $szContentTypeHeader = "";
	my $fQuotedPrintable = 0;
	my $cConsecQuotedLines = 0;
	my $fConvertHTML = 0;
	my $fInTag = 0;
	
	while (<>) {
		chomp;
		my $szThisLine = $_;

		if (!$fTopLevel && $szThisLine =~ /^--\Q$szBoundary\E($|--$)/) {
			#
			# we hit a boundary in a multipart section
			#
			if ($1 eq "--") {
				#
				# this is the end of the multipart section, return
				# to our caller
				#
				return;
			}
			# default to plaintext in all sections unless told otherwise
			$szContentType = "text/plain";
			# we are in headers again
			$fInHeaders = 1;
		} elsif ($fInHeaders && $szThisLine eq "") { 
			#
			# we hit the end of the headers.  
			#
			$fInHeaders = 0;

			# if there was a multipart section with a new boundary marker
			# then we need to recurse into it and clean it up
			if ($szNewBoundary ne "") {
				$szRemoved .= "$szRemoveDepth$szContentType\n";
				if ($fTopLevel) {
					$szHeaders = $szHeaders . "X-StripMime: Non-text section removed by stripmime\n";
				}
				StripMime($szNewBoundary, $szContentType, $szRemoveDepth . "  ");
				$szNewBoundary = "";
			} else {
				if (($szContentType eq "text/plain" || $fConvertHTML) && $szHeaders ne "") {
					print $szHeaders;
					print $szContentTransferEncodingHeader;
					print $szContentTypeHeader;
					print "\n";
					$szHeaders = "";
				}
				if ($szContentType ne "text/plain" || $szRemoved ne "") {
					if ($szContentType eq "text/plain") {
						$szRemoved .= "$szRemoveDepth$szContentType  (text body -- kept)\n";
					} elsif ($szContentType eq "text/html" && $fConvertHTML) {
						$szRemoved .= "$szRemoveDepth$szContentType  (html body -- converted)\n";
					} else {
						$szRemoved .= "$szRemoveDepth$szContentType  (removed)\n";
					}
				}
			}
		} elsif ($fInHeaders) {
			#
			# we are processing headers
			#

			my $szHeaderName = "";

			# check for a header continuation
			if ($szThisLine =~ /^\s(.*)$/) {
				$szLine .= $1;
			} else {
				$szLine = $szThisLine;
			}

			# get the name of this header
			if ($szLine =~ /^([\w-]+):/) { 
				$szHeaderName = lc($1); 
			}

			# get the content type
			if ($szLine =~ /^Content-type:\s*([^;]*)(;|$)/i) { 
				$szContentType = lc($1);
				$szContentTypeHeader = $szLine . "\n";

				if ($szContentType eq "text/html" && $fTopLevel) {
					$fConvertHTML = 1;
					$szContentTypeHeader = "Content-Type: text/plain\n";
				}
			}

			# see if this message is forwarded
			if ($szThisLine =~ /^subject:(\s+fw:|.*\(fwd\)\s*$)/i) {
				$fForward = 1;
			}

			# if the content type is multipart then get the boundary code
			if ($szLine =~ /^Content-type:\s*multipart\/.*boundary=(\"([^\"]+)\"|([\w+\'\(\)\+,\-.\/:=?]+))/i) {
				$szNewBoundary = ($2 eq "" ? $3 : $2);
			}

			# get the content transfer encoding.  if it is quoted-printable
			# then we will clean it up a bit when working on the body.
			if ($szLine =~ /^Content-transfer-encoding:\s+(.*)$/i) {
				$szContentTransferEncodingHeader = $szLine . "\n";
				if ($1 =~ /quoted-printable/i) {
					$fQuotedPrintable = 1;
				}
			}

			# print this header if it is at the top-level and not one
			# of our junk headers
			if ($fTopLevel && !(exists $rgJunkHeaders{$szHeaderName})) {
				$szHeaders = $szHeaders . "$szThisLine\n";
			}
		} elsif ($szContentType eq "text/plain" | $fConvertHTML) {
			# go through and work on the normal body
			# if the line ends with = then strip it
			#
			# This is pretty stupid!! Removing trailing '=' does
			# not decode quoted-printable properly and makes line
			# breaks look stupid, nor does this filter adjust the
			# Content-transfer-encoding: accordingly anywhere.
			if (0 & $fQuotedPrintable) {
				# remove trailing =
				$szThisLine =~ s/=$//;

				# if a normal character is quoted then unquote it
				# we need to process left to right (instead of
				# reprocessing what we've processed) so that we
				# don't convert something "=3D20" to " " (it
				# should be "=20"
				my $szRemainder = $szThisLine;
				$szThisLine = "";

				while ($szRemainder =~ /=([0-9A-F]{2})/) {
					my $i = hex $1;
					my $ch = "=$1";
					if ($i >= 32) {
						$ch = chr($i);
					} 
					$szThisLine .= "$`$ch";
					$szRemainder = $';
				}
				$szThisLine .= $szRemainder;
			}

			# we have a very simple parser for HTML.  we go through
			# each line, removing tags until none are left.  when
			# we find tags we look them up in %hHTMLTagConversion
			# and convert them into ascii if they are in that
			# table.  The goal here is to get nice looking ASCII
			# text with a minimum of real work
			if ($fConvertHTML) {
				my $szBeforeTag;
				my $szInTag;
				my $iStartTag;
				my $iEndTag;
				my $szAfterTag;
				my $szTagConversion = "";
				my $szTag;
				if ($fInTag) {
					if (($iEndTag = index($szThisLine, ">")) != -1) {
						$szThisLine = substr $szThisLine, $iEndTag+1;
					} else {
						$szThisLine = "";
					}
				}
				while (($iStartTag = index($szThisLine, "<")) != -1) {
					$szBeforeTag = substr $szThisLine, 0, $iStartTag;
					$szInTag = substr $szThisLine, $iStartTag+1;
					if (($iEndTag = index($szInTag, ">")) != -1) {
						# we have the end of the tag on this line
						$szTag = substr $szInTag, 0, $iEndTag;
						$szAfterTag = substr $szInTag, $iEndTag+1;
					} else {
						# the tag extends to another line
						$szTag = $szInTag;
						$fInTag = 1;
						$szAfterTag = "";
					}
					$szTag = lc $szTag;	
					if (exists $hHTMLTagConversion{$szTag}) {
						$szTagConversion = $hHTMLTagConversion{$szTag};
					}
					$szThisLine = "$szBeforeTag$szTagConversion$szAfterTag";
				}

				# now that we have removed all tags we go through and
				# convert special characters back to their ascii
				# equivelents
				while ($szThisLine =~ /&(\w*);/) {
					my $szBefore = $`;
					my $szAscii;
					my $szEntity = $1;
					my $szAfter = $';
					if ($szEntity =~ /^#(\n\n\n)/) {
						if ($1 < 255) {
							# convert a numeric code
							$szAscii = chr $1;
						} else {
							$szAscii = "{$szEntity}";
						}
					} elsif (exists $hHTMLCharConversion{$szEntity}) {
						$szAscii = $hHTMLCharConversion{$szEntity};
					} else {
						# don't know how to convert
						$szAscii = "{$szEntity}";
					}
					$szThisLine = "$szBefore$szAscii$szAfter";
				}
			}

#			if (length($szThisLine) < 77) {
				print "$szThisLine\n";
#			} else {
#				# wrap long lines using this format
#format =
#^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
#$szThisLine
#.
#				# figure out the quote characters at the front of
#				# the line.  if they exist then we will put them in
#				# front of wrapped lines to make the quoting look
#				# right
#				$szThisLine =~ /([>\s]+)/;
#				do {
#					write;
#					$szThisLine = $1 . $szThisLine;
#				} while ($szThisLine ne $1);
#			}
		}
	}

	if ($fTopLevel) {
		if ($szHeaders ne "" && $szContentType ne "text/plain") {
			# if there was no text/plain part then we can end up here.
			# not much that we can do, so we add a special header saying
			# that the content was HTML only and then put a helpful blurb
			# in the body
			print $szHeaders;
			print "X-StripMime-Failure: no text/plain\n";
			print "\n";
			print "--- StripMime Report -- processed MIME parts ---\n";
			print "$szRemoved";
			print "--- StripMime Errors ---\n";
			print "A message with no text/plain section was received.\n";
			print "The entire body of the message was removed.  Please\n";
			print "resend the email using plaintext formatting.\n";
			print "---\n";
		} elsif ($szRemoved ne "") {
			# if we removed some stuff then let the world know
			print "\n";
			print "--- StripMime Report -- processed MIME parts ---\n";
			print "$szRemoved";
			print "---\n";
		}
	}
}

# start things going at the top level
StripMime("", "text/plain", "");