diff --git a/kmime/kmime_charfreq.cpp b/kmime/kmime_charfreq.cpp index def2174b1..6880a2417 100644 --- a/kmime/kmime_charfreq.cpp +++ b/kmime/kmime_charfreq.cpp @@ -1,252 +1,252 @@ /* kmime_charfreq.cpp KMime, the KDE internet mail/usenet news message library. Copyright (c) 2001-2002 Marc Mutz This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ /** @file This file is part of the API for handling MIME data and defines the CharFreq class. @brief Defines the CharFreq class. @authors Marc Mutz \ */ #include "kmime_charfreq.h" using namespace KMime; /** * Private class that helps to provide binary compatibility between releases. * @internal */ //@cond PRIVATE //class KMime::CharFreq::Private //{ // public: //}; //@endcond CharFreq::CharFreq( const QByteArray &buf ) : mNUL( 0 ), mCTL( 0 ), mCR( 0 ), mLF( 0 ), mCRLF( 0 ), mPrintable( 0 ), mEightBit( 0 ), mTotal( 0 ), mLineMin( 0xffffffff ), mLineMax( 0 ), mTrailingWS( false ), mLeadingFrom( false ) { if ( !buf.isEmpty() ) { count( buf.data(), buf.size() ); } } CharFreq::CharFreq( const char *buf, size_t len ) : mNUL( 0 ), mCTL( 0 ), mCR( 0 ), mLF( 0 ), mCRLF( 0 ), mPrintable( 0 ), mEightBit( 0 ), mTotal( 0 ), mLineMin( 0xffffffff ), mLineMax( 0 ), mTrailingWS( false ), mLeadingFrom( false ) { if ( buf && len > 0 ) { count( buf, len ); } } //@cond PRIVATE static inline bool isWS( char ch ) { return ( ch == '\t' || ch == ' ' ); } //@endcond void CharFreq::count( const char *it, size_t len ) { const char *end = it + len; uint currentLineLength = 0; // initialize the prevChar with LF so that From_ detection works w/o // special-casing: char prevChar = '\n'; char prevPrevChar = 0; for ( ; it != end ; ++it ) { ++currentLineLength; switch ( *it ) { case '\0': ++mNUL; break; case '\r': ++mCR; break; case '\n': ++mLF; if ( prevChar == '\r' ) { --currentLineLength; ++mCRLF; } if ( currentLineLength >= mLineMax ) { mLineMax = currentLineLength-1; } if ( currentLineLength <= mLineMin ) { mLineMin = currentLineLength-1; } if ( !mTrailingWS ) { if ( isWS( prevChar ) || ( prevChar == '\r' && isWS( prevPrevChar ) ) ) { mTrailingWS = true; } } currentLineLength = 0; break; case 'F': // check for lines starting with From_ if not found already: if ( !mLeadingFrom ) { if ( prevChar == '\n' && end - it >= 5 && !qstrncmp( "From ", it, 5 ) ) { mLeadingFrom = true; } } ++mPrintable; break; default: { uchar c = *it; if ( c == '\t' || ( c >= ' ' && c <= '~' ) ) { ++mPrintable; } else if ( c == 127 || c < ' ' ) { ++mCTL; } else { ++mEightBit; } } } prevPrevChar = prevChar; prevChar = *it; } // consider the length of the last line if ( currentLineLength >= mLineMax ) { mLineMax = currentLineLength; } if ( currentLineLength <= mLineMin ) { mLineMin = currentLineLength; } // check whether the last character is tab or space if ( isWS( prevChar ) ) { mTrailingWS = true; } mTotal = len; } bool CharFreq::isEightBitData() const { return type() == EightBitData; } bool CharFreq::isEightBitText() const { return type() == EightBitText; } bool CharFreq::isSevenBitData() const { return type() == SevenBitData; } bool CharFreq::isSevenBitText() const { return type() == SevenBitText; } bool CharFreq::hasTrailingWhitespace() const { return mTrailingWS; } bool CharFreq::hasLeadingFrom() const { return mLeadingFrom; } CharFreq::Type CharFreq::type() const { #if 0 qDebug( "Total: %d; NUL: %d; CTL: %d;\n" "CR: %d; LF: %d; CRLF: %d;\n" "lineMin: %d; lineMax: %d;\n" "printable: %d; eightBit: %d;\n" "trailing whitespace: %s;\n" "leading 'From ': %s;\n", total, NUL, CTL, CR, LF, CRLF, lineMin, lineMax, printable, eightBit, mTrailingWS ? "yes" : "no" , mLeadingFrom ? "yes" : "no" ); #endif if ( mNUL ) { // must be binary return Binary; } // doesn't contain NUL's: if ( mEightBit ) { if ( mLineMax > 988 ) { return EightBitData; // not allowed in 8bit } - if ( mCR != mCRLF || controlCodesRatio() > 0.2 ) { + if ( mLF != mCRLF || mCR != mCRLF || controlCodesRatio() > 0.2 ) { return EightBitData; } return EightBitText; } // doesn't contain NUL's, nor 8bit chars: if ( mLineMax > 988 ) { return SevenBitData; } - if ( mCR != mCRLF || controlCodesRatio() > 0.2 ) { + if ( mLF != mCRLF || mCR != mCRLF || controlCodesRatio() > 0.2 ) { return SevenBitData; } // no NUL, no 8bit chars, no excessive CTLs and no lines > 998 chars: return SevenBitText; } float CharFreq::printableRatio() const { if ( mTotal ) { return float(mPrintable) / float(mTotal); } else { return 0; } } float CharFreq::controlCodesRatio() const { if ( mTotal ) { return float(mCTL) / float(mTotal); } else { return 0; } } diff --git a/kmime/kmime_charfreq.h b/kmime/kmime_charfreq.h index af2fc0b35..d5fde8c63 100644 --- a/kmime/kmime_charfreq.h +++ b/kmime/kmime_charfreq.h @@ -1,179 +1,184 @@ /* -*- c++ -*- kmime_charfreq.h KMime, the KDE internet mail/usenet news message library. Copyright (c) 2001-2002 Marc Mutz This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ /** @file This file is part of the API for handling @ref MIME data and defines the CharFreq class. @brief Defines the CharFreq class. @authors Marc Mutz \ @glossary @anchor Eight-Bit @anchor eight-bit @b 8-bit: - Data that contains bytes with at least one value greater than 127. + Data that contains bytes with at least one value greater than 127, or at + least one NUL byte. @glossary @anchor Eight-Bit-Binary @anchor eight-bit-binary @b 8-bit-binary: - Eight-bit data that contains a high percentage of non-ascii values. + Eight-bit data that contains a high percentage of non-ascii values, + or lines longer than 998 characters, or stray CRs or LFs, or NULs. @glossary @anchor Eight-Bit-Text @anchor eight-bit-text @b 8-bit-text: - Eight-bit data that contains a high percentage of non-ascii values. + Eight-bit data that contains a high percentage of ascii values, + no lines longer than 998 characters, no stray CRs or LFs, and no NULs. @glossary @anchor Seven-Bit @anchor seven-bit @b 7-Bit: - Data that contains bytes with all values less than 128. + Data that contains bytes with all values less than 128, and no NULs. @glossary @anchor Seven-Bit-Binary @anchor seven-bit-binary @b 7-bit-binary: - Seven-bit data that contains a high percentage of non-ascii values. + Seven-bit data that contains a high percentage of non-ascii values, + or lines longer than 998 characters, or stray CRs or LFs. @glossary @anchor Seven-Bit-Text @anchor seven-bit-text @b 7-bit-text: - Seven-bit data that contains a high percentage of ascii values. + Seven-bit data that contains a high percentage of ascii values, + no lines longer than 998 characters, no stray CRs or LFs. */ #ifndef __KMIME_CHARFREQ_H__ #define __KMIME_CHARFREQ_H__ #include #include "kmime_export.h" #undef None namespace KMime { /** @brief A class for performing basic data typing using frequency count heuristics. This class performs character frequency counts on the provided data which are used in heuristics to determine a basic data type. The data types are: - @ref Eight-Bit-Binary - @ref Eight-Bit-Text - @ref Seven-Bit-Binary - @ref Seven-Bit-Text */ class KMIME_EXPORT CharFreq { public: /** Constructs a Character Frequency instance for a buffer @p buf of QByteArray data. @param buf is a QByteArray containing the data. */ explicit CharFreq( const QByteArray &buf ); /** Constructs a Character Frequency instance for a buffer @p buf of chars of length @p len. @param buf is a pointer to a character string containing the data. @param len is the length of @p buf, in characters. */ CharFreq( const char *buf, size_t len ); /** The different types of data. */ enum Type { None = 0, /**< Unknown */ EightBitData, /**< 8bit binary */ Binary = EightBitData, /**< 8bit binary */ SevenBitData, /**< 7bit binary */ EightBitText, /**< 8bit text */ SevenBitText /**< 7bit text */ }; /** Returns the data #Type as derived from the class heuristics. */ Type type() const; /** Returns true if the data #Type is EightBitData; false otherwise. */ bool isEightBitData() const; /** Returns true if the data #Type is EightBitText; false otherwise. */ bool isEightBitText() const; /** Returns true if the data #Type is SevenBitData; false otherwise. */ bool isSevenBitData() const; /** Returns true if the data #Type is SevenBitText; false otherwise. */ bool isSevenBitText() const; /** Returns true if the data contains trailing whitespace. i.e., if any line ends with space (' ') or tab ('\\t'). */ bool hasTrailingWhitespace() const; /** Returns true if the data contains a line that starts with "From ". */ bool hasLeadingFrom() const; /** Returns the percentage of printable characters in the data. The result is undefined if the number of data characters is zero. */ float printableRatio() const; /** Returns the percentage of control code characters (CTLs) in the data. The result is undefined if the number of data characters is zero. */ float controlCodesRatio() const; private: //@cond PRIVATE uint mNUL; // count of NUL chars uint mCTL; // count of CTLs (incl. DEL, excl. CR, LF, HT) uint mCR; // count of CR chars uint mLF; // count of LF chars uint mCRLF; // count of LFs, preceded by CRs uint mPrintable; // count of printable US-ASCII chars (SPC..~) uint mEightBit; // count of other latin1 chars (those with 8th bit set) uint mTotal; // count of all chars uint mLineMin; // minimum line length uint mLineMax; // maximum line length bool mTrailingWS; // does the buffer contain trailing whitespace? bool mLeadingFrom; // does the buffer contain lines starting with "From "? //@endcond /** Performs the character frequency counts on the data. @param buf is a pointer to a character string containing the data. @param len is the length of @p buf, in characters. */ void count( const char *buf, size_t len ); }; } // namespace KMime #endif /* __KMIME_CHARFREQ_H__ */ diff --git a/kmime/tests/CMakeLists.txt b/kmime/tests/CMakeLists.txt index c1e00ac76..a2e69cdb1 100644 --- a/kmime/tests/CMakeLists.txt +++ b/kmime/tests/CMakeLists.txt @@ -1,71 +1,72 @@ set( EXECUTABLE_OUTPUT_PATH ${CMAKE_CURRENT_BINARY_DIR} ) include_directories(${CMAKE_SOURCE_DIR}/kmime) # convenience macro to add libkmime qtestlib qtgui unit-tests macro(add_kmime_test _source) set(_test ${_source}) get_filename_component(_name ${_source} NAME_WE) kde4_add_unit_test(${_name} TESTNAME kmime-${_name} ${_test}) target_link_libraries(${_name} kmime ${QT_QTTEST_LIBRARY} ${QT_QTGUI_LIBRARY} ${QT_QTCORE_LIBRARY} ${KDE4_KDEUI_LIBS} ) endmacro(add_kmime_test) ########### next target ############### if(HAVE_GETOPT_H) set(test_kmime_header_parsing_SRCS test_kmime_header_parsing.cpp ) kde4_add_executable(test_kmime_header_parsing TEST ${test_kmime_header_parsing_SRCS}) target_link_libraries(test_kmime_header_parsing kmime ${KDE4_KDECORE_LIBS} ) endif(HAVE_GETOPT_H) ########### next target ############### set(test_charfreq_SRCS test_charfreq.cpp ) kde4_add_executable(test_charfreq TEST ${test_charfreq_SRCS}) target_link_libraries(test_charfreq kmime ${KDE4_KDECORE_LIBS} ) ########### next target ############### if(HAVE_GETOPT_H) set(test_mdn_SRCS test_mdn.cpp ) kde4_add_executable(test_mdn TEST ${test_mdn_SRCS}) target_link_libraries(test_mdn kmime ${KDE4_KDECORE_LIBS} ) endif(HAVE_GETOPT_H) ########### next target ############### set(test_dates_SRCS test_dates.cpp ) kde4_add_executable(test_dates TEST ${test_dates_SRCS}) target_link_libraries(test_dates kmime ${KDE4_KDECORE_LIBS} ) ########### next target ############### if(HAVE_GETOPT_H) set(test_kmime_codec_SRCS test_kmime_codec.cpp ) kde4_add_executable(test_kmime_codec TEST ${test_kmime_codec_SRCS}) target_link_libraries(test_kmime_codec kmime ${KDE4_KDECORE_LIBS} ) endif(HAVE_GETOPT_H) # qtestlib unit tests add_kmime_test(rfc2047test.cpp) add_kmime_test(kmime_util_test.cpp) add_kmime_test(contentindextest.cpp) +add_kmime_test(kmime_charfreq_test.cpp) add_kmime_test(kmime_content_test.cpp) add_kmime_test(headertest.cpp) add_kmime_test(kmime_message_test.cpp) diff --git a/kmime/tests/kmime_charfreq_test.cpp b/kmime/tests/kmime_charfreq_test.cpp new file mode 100644 index 000000000..fc6d9299b --- /dev/null +++ b/kmime/tests/kmime_charfreq_test.cpp @@ -0,0 +1,154 @@ +/* + Copyright (c) 2009 Constantin Berzan + + This library is free software; you can redistribute it and/or modify it + under the terms of the GNU Library General Public License as published by + the Free Software Foundation; either version 2 of the License, or (at your + option) any later version. + + This library is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + License for more details. + + You should have received a copy of the GNU Library General Public License + along with this library; see the file COPYING.LIB. If not, write to the + Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. +*/ + +#include "kmime_charfreq_test.h" +#include + +#include + +#include +using namespace KMime; + +QTEST_KDEMAIN( KMimeCharFreqTest, NoGUI ) + +void KMimeCharFreqTest::test8bitData() +{ + { + // If it has NUL then it's Binary (equivalent to EightBitData in CharFreq). + QByteArray data( "123" ); + data += char( 0 ); + data += "test"; + kDebug() << data; + CharFreq cf( data ); + QCOMPARE( cf.type(), CharFreq::Binary ); + } + + { + // If it has lines longer than 998, it's EightBitData. + QByteArray data; + for( int i = 0; i < 999; i++ ) { + data += char( 169 ); + } + kDebug() << data; + CharFreq cf( data ); + QCOMPARE( cf.type(), CharFreq::EightBitData ); + } + + { + // If #CR != #CRLF then it's EightBitData. + QByteArray data( "©line1\r\nline2\r" ); + kDebug() << data; + CharFreq cf( data ); + QCOMPARE( cf.type(), CharFreq::EightBitData ); + } + + { + // If #LF != #CRLF then it's EightBitData. + QByteArray data( "©line1\r\nline2\n" ); + kDebug() << data; + CharFreq cf( data ); + QCOMPARE( cf.type(), CharFreq::EightBitData ); + } + + { + // If it has a lot of control chars, it's EightBitData. + QByteArray data( "©test\a\a\a\a\a\a\a" ); + kDebug() << data; + CharFreq cf( data ); + QCOMPARE( cf.type(), CharFreq::EightBitData ); + } +} + +void KMimeCharFreqTest::test8bitText() +{ + { + // If it has no NULs, few CTLs, no stray CRs or LFs, it's EightBitText. + QByteArray data( "©beware the beast but enjoy the feast he offers...\r\n" ); + kDebug() << data; + CharFreq cf( data ); + QCOMPARE( cf.type(), CharFreq::EightBitText ); + } +} + +void KMimeCharFreqTest::test7bitData() +{ + { + // If it has lines longer than 998, it's SevenBitData. + QByteArray data; + for( int i = 0; i < 999; i++ ) { + data += 'a'; + } + kDebug() << data; + CharFreq cf( data ); + QCOMPARE( cf.type(), CharFreq::SevenBitData ); + } + + { + // If #CR != #CRLF then it's SevenBitData. + QByteArray data( "line1\r\nline2\r" ); + kDebug() << data; + CharFreq cf( data ); + QCOMPARE( cf.type(), CharFreq::SevenBitData ); + } + + { + // If #LF != #CRLF then it's SevenBitData. + QByteArray data( "line1\r\nline2\n" ); + kDebug() << data; + CharFreq cf( data ); + QCOMPARE( cf.type(), CharFreq::SevenBitData ); + } + + { + // If it has a lot of control chars, it's SevenBitData. + QByteArray data( "test\a\a\a\a\a\a\a" ); + kDebug() << data; + CharFreq cf( data ); + QCOMPARE( cf.type(), CharFreq::SevenBitData ); + } +} + +void KMimeCharFreqTest::test7bitText() +{ + { + // If it has no NULs, few CTLs, no stray CRs or LFs, it's SevenBitText. + QByteArray data( "beware the beast but enjoy the feast he offers...\r\n" ); + kDebug() << data; + CharFreq cf( data ); + QCOMPARE( cf.type(), CharFreq::SevenBitText ); + } +} + +void KMimeCharFreqTest::testTrailingWhitespace() +{ + QByteArray data( "test " ); + kDebug() << data; + CharFreq cf( data ); + QVERIFY( cf.hasTrailingWhitespace() ); +} + +void KMimeCharFreqTest::testLeadingFrom() +{ + QByteArray data( "From here thither" ); + kDebug() << data; + CharFreq cf( data ); + QVERIFY( cf.hasLeadingFrom() ); +} + +#include "kmime_charfreq_test.moc" diff --git a/kmime/tests/kmime_charfreq_test.h b/kmime/tests/kmime_charfreq_test.h new file mode 100644 index 000000000..dae220be3 --- /dev/null +++ b/kmime/tests/kmime_charfreq_test.h @@ -0,0 +1,36 @@ +/* + Copyright (c) 2009 Constantin Berzan + + This library is free software; you can redistribute it and/or modify it + under the terms of the GNU Library General Public License as published by + the Free Software Foundation; either version 2 of the License, or (at your + option) any later version. + + This library is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + License for more details. + + You should have received a copy of the GNU Library General Public License + along with this library; see the file COPYING.LIB. If not, write to the + Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. +*/ +#ifndef KMIME_CHARSET_TEST_H +#define KMIME_CHARSET_TEST_H + +#include + +class KMimeCharFreqTest : public QObject +{ + Q_OBJECT + private Q_SLOTS: + void test8bitData(); + void test8bitText(); + void test7bitData(); + void test7bitText(); + void testTrailingWhitespace(); + void testLeadingFrom(); +}; + +#endif