diff --git a/kmime/kmime_charfreq.h b/kmime/kmime_charfreq.h index 1990f2434..46cfc8140 100644 --- a/kmime/kmime_charfreq.h +++ b/kmime/kmime_charfreq.h @@ -1,184 +1,185 @@ /* -*- c++ -*- kmime_charfreq.h KMime, the KDE Internet mail/usenet news message library. Copyright (c) 2001-2002 Marc Mutz This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ /** @file This file is part of the API for handling @ref MIME data and defines the CharFreq class. @brief Defines the CharFreq class. @authors Marc Mutz \ @glossary @anchor Eight-Bit @anchor eight-bit @b 8-bit: Data that contains bytes with at least one value greater than 127, or at least one NUL byte. @glossary @anchor Eight-Bit-Binary @anchor eight-bit-binary @b 8-bit-binary: Eight-bit data that contains a high percentage of non-ascii values, - or lines longer than 998 characters, or stray CRs or LFs, or NULs. + or lines longer than 998 characters, or stray CRs, or a mix of bare LFs and CRLFs, or NULs. @glossary @anchor Eight-Bit-Text @anchor eight-bit-text @b 8-bit-text: Eight-bit data that contains a high percentage of ascii values, - no lines longer than 998 characters, no stray CRs or LFs, and no NULs. + no lines longer than 998 characters, no NULs, and either only LFs or + only CRLFs. 
@glossary @anchor Seven-Bit @anchor seven-bit @b 7-Bit: Data that contains bytes with all values less than 128, and no NULs. @glossary @anchor Seven-Bit-Binary @anchor seven-bit-binary @b 7-bit-binary: Seven-bit data that contains a high percentage of non-ascii values, - or lines longer than 998 characters, or stray CRs or LFs. + or lines longer than 998 characters, or stray CRs, or a mix of bare LFs and CRLFs. @glossary @anchor Seven-Bit-Text @anchor seven-bit-text @b 7-bit-text: Seven-bit data that contains a high percentage of ascii values, - no lines longer than 998 characters, no stray CRs or LFs. + no lines longer than 998 characters, and either only LFs, or only CRLFs. */ #ifndef __KMIME_CHARFREQ_H__ #define __KMIME_CHARFREQ_H__ #include #include "kmime_export.h" #undef None namespace KMime { /** @brief A class for performing basic data typing using frequency count heuristics. This class performs character frequency counts on the provided data which are used in heuristics to determine a basic data type. The data types are: - @ref Eight-Bit-Binary - @ref Eight-Bit-Text - @ref Seven-Bit-Binary - @ref Seven-Bit-Text */ class KMIME_EXPORT CharFreq { public: /** Constructs a Character Frequency instance for a buffer @p buf of QByteArray data. @param buf is a QByteArray containing the data. */ explicit CharFreq( const QByteArray &buf ); /** Constructs a Character Frequency instance for a buffer @p buf of chars of length @p len. @param buf is a pointer to a character string containing the data. @param len is the length of @p buf, in characters. */ CharFreq( const char *buf, size_t len ); /** The different types of data. */ enum Type { None = 0, /**< Unknown */ EightBitData, /**< 8bit binary */ Binary = EightBitData, /**< 8bit binary */ SevenBitData, /**< 7bit binary */ EightBitText, /**< 8bit text */ SevenBitText /**< 7bit text */ }; /** Returns the data #Type as derived from the class heuristics. */ Type type() const; /** Returns true if the data #Type is EightBitData; false otherwise. 
*/ bool isEightBitData() const; /** Returns true if the data #Type is EightBitText; false otherwise. */ bool isEightBitText() const; /** Returns true if the data #Type is SevenBitData; false otherwise. */ bool isSevenBitData() const; /** Returns true if the data #Type is SevenBitText; false otherwise. */ bool isSevenBitText() const; /** Returns true if the data contains trailing whitespace. i.e., if any line ends with space (' ') or tab ('\\t'). */ bool hasTrailingWhitespace() const; /** Returns true if the data contains a line that starts with "From ". */ bool hasLeadingFrom() const; /** Returns the percentage of printable characters in the data. The result is undefined if the number of data characters is zero. */ float printableRatio() const; /** Returns the percentage of control code characters (CTLs) in the data. The result is undefined if the number of data characters is zero. */ float controlCodesRatio() const; private: //@cond PRIVATE uint mNUL; // count of NUL chars uint mCTL; // count of CTLs (incl. DEL, excl. CR, LF, HT) uint mCR; // count of CR chars uint mLF; // count of LF chars uint mCRLF; // count of LFs, preceded by CRs uint mPrintable; // count of printable US-ASCII chars (SPC..~) uint mEightBit; // count of other latin1 chars (those with 8th bit set) uint mTotal; // count of all chars uint mLineMin; // minimum line length uint mLineMax; // maximum line length bool mTrailingWS; // does the buffer contain trailing whitespace? bool mLeadingFrom; // does the buffer contain lines starting with "From "? //@endcond /** Performs the character frequency counts on the data. @param buf is a pointer to a character string containing the data. @param len is the length of @p buf, in characters. 
*/ void count( const char *buf, size_t len ); }; } // namespace KMime #endif /* __KMIME_CHARFREQ_H__ */ diff --git a/kmime/tests/kmime_charfreq_test.cpp b/kmime/tests/kmime_charfreq_test.cpp index 9a0297a7d..9115b2b1d 100644 --- a/kmime/tests/kmime_charfreq_test.cpp +++ b/kmime/tests/kmime_charfreq_test.cpp @@ -1,155 +1,155 @@ /* Copyright (c) 2009 Constantin Berzan This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "kmime_charfreq_test.h" #include #include #include using namespace KMime; QTEST_KDEMAIN( KMimeCharFreqTest, NoGUI ) void KMimeCharFreqTest::test8bitData() { { // If it has NUL then it's Binary (equivalent to EightBitData in CharFreq). QByteArray data( "123" ); data += char( 0 ); data += "test"; CharFreq cf( data ); QCOMPARE( cf.type(), CharFreq::Binary ); } { // If it has lines longer than 998, it's EightBitData. QByteArray data; for( int i = 0; i < 999; i++ ) { data += char( 169 ); } CharFreq cf( data ); QCOMPARE( cf.type(), CharFreq::EightBitData ); } { // If #CR != #CRLF then it's EightBitData. QByteArray data( "©line1\r\nline2\r" ); CharFreq cf( data ); QCOMPARE( cf.type(), CharFreq::EightBitData ); } { // If #LF != #CRLF then it's EightBitData. 
QByteArray data( "©line1\r\nline2\n" ); CharFreq cf( data ); QCOMPARE( cf.type(), CharFreq::EightBitData ); } { // If it has a lot of control chars, it's EightBitData. QByteArray data( "©test\a\a\a\a\a\a\a" ); CharFreq cf( data ); QCOMPARE( cf.type(), CharFreq::EightBitData ); } } void KMimeCharFreqTest::test8bitText() { { - // If it has no NULs, few CTLs, no stray CRs or LFs, it's EightBitText. + // If the text only contains newlines and some random accented chars, then it is EightBitText + QByteArray data( "asdfasdfasdfasdfasdfasdfäöü\n" ); + CharFreq cf( data ); + QCOMPARE( cf.type(), CharFreq::EightBitText ); + } + + { + // If it has no NULs, few CTLs, and only CRLFs, it's EightBitText. QByteArray data( "©beware the beast but enjoy the feast he offers...\r\n" ); CharFreq cf( data ); QCOMPARE( cf.type(), CharFreq::EightBitText ); } } void KMimeCharFreqTest::test7bitData() { { // If it has lines longer than 998, it's SevenBitData. QByteArray data; for( int i = 0; i < 999; i++ ) { data += 'a'; } CharFreq cf( data ); QCOMPARE( cf.type(), CharFreq::SevenBitData ); } { // If #CR != #CRLF then it's SevenBitData. QByteArray data( "line1\r\nline2\r" ); CharFreq cf( data ); QCOMPARE( cf.type(), CharFreq::SevenBitData ); } { // If #LF != #CRLF then it's SevenBitData. QByteArray data( "line1\r\nline2\n" ); CharFreq cf( data ); QCOMPARE( cf.type(), CharFreq::SevenBitData ); } - { - // If the text only contains newlines, then it is SevenBitText - QByteArray data( "line1\nline2\n" ); - CharFreq cf( data ); - QCOMPARE( cf.type(), CharFreq::SevenBitText ); - } - - { - // If the text only contains newlines and some random accented chars, then it is EightBitText - QByteArray data( "asdfasdfasdfasdfasdfasdfäöü\n" ); - CharFreq cf( data ); - QCOMPARE( cf.type(), CharFreq::EightBitText ); - } - { // If it has a lot of control chars, it's SevenBitData. 
QByteArray data( "test\a\a\a\a\a\a\a" ); CharFreq cf( data ); QCOMPARE( cf.type(), CharFreq::SevenBitData ); } } void KMimeCharFreqTest::test7bitText() { { - // If it has no NULs, few CTLs, no stray CRs or LFs, it's SevenBitText. + // If the text only contains newlines, then it is SevenBitText + QByteArray data( "line1\nline2\n" ); + CharFreq cf( data ); + QCOMPARE( cf.type(), CharFreq::SevenBitText ); + } + + { + // If it has no NULs, few CTLs, and only CRLFs, it's SevenBitText. QByteArray data( "beware the beast but enjoy the feast he offers...\r\n" ); CharFreq cf( data ); QCOMPARE( cf.type(), CharFreq::SevenBitText ); } } void KMimeCharFreqTest::testTrailingWhitespace() { QByteArray data( "test " ); CharFreq cf( data ); QVERIFY( cf.hasTrailingWhitespace() ); } void KMimeCharFreqTest::testLeadingFrom() { QByteArray data( "From here thither" ); CharFreq cf( data ); QVERIFY( cf.hasLeadingFrom() ); } #include "kmime_charfreq_test.moc"