libkmime

kmime_charfreq.cpp
1 /*
2  kmime_charfreq.cpp
3 
4  KMime, the KDE internet mail/usenet news message library.
5  Copyright (c) 2001-2002 Marc Mutz <mutz@kde.org>
6 
7  This program is free software; you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation; version 2 of the License.
10  You should have received a copy of the GNU General Public License
11  along with this program; if not, write to the Free Software Foundation,
12  Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
13 */
14 
15 #include "kmime_charfreq.h"
16 
17 namespace KMime {
18 
19 CharFreq::CharFreq( const TQByteArray & buf )
20  : NUL(0),
21  CTL(0),
22  CR(0), LF(0),
23  CRLF(0),
24  printable(0),
25  eightBit(0),
26  total(0),
27  lineMin(0xffffffff),
28  lineMax(0),
29  mTrailingWS(false),
30  mLeadingFrom(false)
31 {
32  if ( !buf.isEmpty() )
33  count( buf.data(), buf.size() );
34 }
35 
36 CharFreq::CharFreq( const char * buf, size_t len )
37  : NUL(0),
38  CTL(0),
39  CR(0), LF(0),
40  CRLF(0),
41  printable(0),
42  eightBit(0),
43  total(0),
44  lineMin(0xffffffff),
45  lineMax(0),
46  mTrailingWS(false),
47  mLeadingFrom(false)
48 {
49  if ( buf && len > 0 )
50  count( buf, len );
51 }
52 
// Returns true for the two characters treated as whitespace here:
// horizontal tab and space.
static inline bool isWS( char ch ) {
  switch ( ch ) {
  case ' ':
  case '\t':
    return true;
  default:
    return false;
  }
}
54 
55 void CharFreq::count( const char * it, size_t len ) {
56 
57  const char * end = it + len;
58  uint currentLineLength = 0;
59  // initialize the prevChar with LF so that From_ detection works w/o
60  // special-casing:
61  char prevChar = '\n';
62  char prevPrevChar = 0;
63 
64  for ( ; it != end ; ++it ) {
65  ++currentLineLength;
66  switch ( *it ) {
67  case '\0': ++NUL; break;
68  case '\r': ++CR; break;
69  case '\n': ++LF;
70  if ( prevChar == '\r' ) { --currentLineLength; ++CRLF; }
71  if ( currentLineLength >= lineMax ) lineMax = currentLineLength-1;
72  if ( currentLineLength <= lineMin ) lineMin = currentLineLength-1;
73  if ( !mTrailingWS )
74  if ( isWS( prevChar ) || ( prevChar == '\r' && isWS( prevPrevChar ) ) )
75  mTrailingWS = true;
76  currentLineLength = 0;
77  break;
78  case 'F': // check for lines starting with From_ if not found already:
79  if ( !mLeadingFrom )
80  if ( prevChar == '\n' && end - it >= 5 && !tqstrncmp( "From ", it, 5 ) )
81  mLeadingFrom = true;
82  ++printable;
83  break;
84  default:
85  {
86  uchar c = *it;
87  if ( (c == '\t') || ((c >= ' ') && (c <= '~')) )
88  ++printable;
89  else if ( (c == 127) || (c < ' ') )
90  ++CTL;
91  else
92  ++eightBit;
93  }
94  }
95  prevPrevChar = prevChar;
96  prevChar = *it;
97  }
98 
99  // consider the length of the last line
100  if ( currentLineLength >= lineMax ) lineMax = currentLineLength;
101  if ( currentLineLength <= lineMin ) lineMin = currentLineLength;
102 
103  // check whether the last character is tab or space
104  if ( isWS( prevChar ) )
105  mTrailingWS = true;
106 
107  total = len;
108 }
109 
110 bool CharFreq::isEightBitData() const {
111  return type() == EightBitData;
112 }
113 
114 bool CharFreq::isEightBitText() const {
115  return type() == EightBitText;
116 }
117 
118 bool CharFreq::isSevenBitData() const {
119  return type() == SevenBitData;
120 }
121 
122 bool CharFreq::isSevenBitText() const {
123  return type() == SevenBitText;
124 }
125 
126 bool CharFreq::hasTrailingWhitespace() const {
127  return mTrailingWS;
128 }
129 
130 bool CharFreq::hasLeadingFrom() const {
131  return mLeadingFrom;
132 }
133 
134 CharFreq::Type CharFreq::type() const {
135 #if 0
136  tqDebug( "Total: %d; NUL: %d; CTL: %d;\n"
137  "CR: %d; LF: %d; CRLF: %d;\n"
138  "lineMin: %d; lineMax: %d;\n"
139  "printable: %d; eightBit: %d;\n"
140  "trailing whitespace: %s;\n"
141  "leading 'From ': %s;\n",
142  total, NUL, CTL, CR, LF, CRLF, lineMin, lineMax,
143  printable, eightBit,
144  mTrailingWS ? "yes" : "no" , mLeadingFrom ? "yes" : "no" );
145 #endif
146  if ( NUL ) // must be binary
147  return Binary;
148 
149  // doesn't contain NUL's:
150  if ( eightBit ) {
151  if ( lineMax > 988 ) return EightBitData; // not allowed in 8bit
152  if ( CR != CRLF || controlCodesRatio() > 0.2 ) return EightBitData;
153  return EightBitText;
154  }
155 
156  // doesn't contain NUL's, nor 8bit chars:
157  if ( lineMax > 988 ) return SevenBitData;
158  if ( CR != CRLF || controlCodesRatio() > 0.2 ) return SevenBitData;
159 
160  // no NUL, no 8bit chars, no excessive CTLs and no lines > 998 chars:
161  return SevenBitText;
162 }
163 
164 float CharFreq::printableRatio() const {
165  if ( total ) return float(printable) / float(total);
166  else return 0;
167 }
168 
169 float CharFreq::controlCodesRatio() const {
170  if ( total ) return float(CTL) / float(total);
171  else return 0;
172 }
173 
174 } // namespace KMime
175 
176