00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #include "kencodingdetector.h"
00029
00030 #undef DECODE_DEBUG
00031
00032
00033 #define MAX_BUFFER 16*1024
00034
00035 #include <assert.h>
00036
00037 #include "guess_ja_p.h"
00038
00039 #include <QRegExp>
00040 #include <QTextCodec>
00041
00042 #include <kglobal.h>
00043 #include <kcharsets.h>
00044 #include <kdebug.h>
00045 #include <klocale.h>
00046
00047 #include <ctype.h>
00048
// Character-set identifiers that this file compares against
// QTextCodec::mibEnum(). Listed in ascending numeric order.
enum MIB
{
    MibLatin1  = 4,
    Mib8859_8  = 85,
    MibUtf8    = 106,
    MibUcs2    = 1000,
    MibUtf16BE = 1013,
    MibUtf16LE = 1014,
    MibUtf16   = 1015
};
00059
00060 static bool is16Bit(QTextCodec* codec)
00061 {
00062 switch (codec->mibEnum())
00063 {
00064 case MibUtf16:
00065 case MibUtf16BE:
00066 case MibUtf16LE:
00067 case MibUcs2:
00068 return true;
00069 default:
00070 return false;
00071 }
00072 }
00073
00074 class KEncodingDetectorPrivate
00075 {
00076 public:
00077 QTextCodec *m_codec;
00078 QTextDecoder *m_decoder;
00079 QTextCodec *m_defaultCodec;
00080 QByteArray m_storeDecoderName;
00081
00082 KEncodingDetector::EncodingChoiceSource m_source;
00083 KEncodingDetector::AutoDetectScript m_autoDetectLanguage;
00084
00085 bool m_visualRTL : 1;
00086 bool m_seenBody : 1;
00087 bool m_writtingHappened : 1;
00088 bool m_analyzeCalled : 1;
00089 int m_multiByte;
00090
00091 QByteArray m_bufferForDefferedEncDetection;
00092
00093 KEncodingDetectorPrivate()
00094 : m_codec(QTextCodec::codecForMib(MibLatin1))
00095 , m_decoder(m_codec->makeDecoder())
00096 , m_defaultCodec(m_codec)
00097 , m_source(KEncodingDetector::DefaultEncoding)
00098 , m_autoDetectLanguage(KEncodingDetector::SemiautomaticDetection)
00099 , m_visualRTL(false)
00100 , m_seenBody(false)
00101 , m_writtingHappened(false)
00102 , m_analyzeCalled(false)
00103 , m_multiByte(0)
00104 {
00105 }
00106
00107 KEncodingDetectorPrivate(QTextCodec* codec,KEncodingDetector::EncodingChoiceSource source, KEncodingDetector::AutoDetectScript script)
00108 : m_codec(codec)
00109 , m_decoder(m_codec->makeDecoder())
00110 , m_defaultCodec(m_codec)
00111 , m_source(source)
00112 , m_autoDetectLanguage(script)
00113 , m_visualRTL(false)
00114 , m_seenBody(false)
00115 , m_writtingHappened(false)
00116 , m_analyzeCalled(false)
00117 , m_multiByte(0)
00118 {
00119 }
00120
00121 ~KEncodingDetectorPrivate()
00122 {
00123 delete m_decoder;
00124 }
00125
00126
00127 bool isExplicitlySpecifiedEncoding()
00128 {
00129 return m_source != KEncodingDetector::DefaultEncoding && m_source != KEncodingDetector::AutoDetectedEncoding;
00130 }
00131 };
00132
00133
00134 static QByteArray automaticDetectionForArabic( const unsigned char* ptr, int size )
00135 {
00136 for ( int i = 0; i < size; ++i ) {
00137 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
00138 || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )
00139 || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
00140 || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {
00141 return "cp1256";
00142 }
00143 }
00144
00145 return "iso-8859-6";
00146 }
00147
00148 static QByteArray automaticDetectionForBaltic( const unsigned char* ptr, int size )
00149 {
00150 for ( int i = 0; i < size; ++i ) {
00151 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )
00152 return "cp1257";
00153
00154 if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )
00155 return "iso-8859-13";
00156 }
00157
00158 return "iso-8859-13";
00159 }
00160
00161 static QByteArray automaticDetectionForCentralEuropean(const unsigned char* ptr, int size )
00162 {
00163 QByteArray charset = QByteArray();
00164 for ( int i = 0; i < size; ++i ) {
00165 if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {
00166 if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )
00167 return "ibm852";
00168
00169 if ( i + 1 > size )
00170 return "cp1250";
00171 else {
00172 charset = "cp1250";
00173 continue;
00174 }
00175 }
00176 if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {
00177 if ( i + 1 > size )
00178 return "iso-8859-2";
00179 else {
00180 if ( charset.isNull() )
00181 charset = "iso-8859-2";
00182 continue;
00183 }
00184 }
00185 }
00186
00187 if ( charset.isNull() )
00188 charset = "iso-8859-3";
00189
00190 return charset.data();
00191 }
00192
00193 static QByteArray automaticDetectionForCyrillic( const unsigned char* ptr, int size)
00194 {
00195 #ifdef DECODE_DEBUG
00196 kWarning() << "KEncodingDetector: Cyr heuristics";
00197 #endif
00198
00199
00200
00201 int utf8_mark=0;
00202 int koi_score=0;
00203 int cp1251_score=0;
00204
00205 int koi_st=0;
00206 int cp1251_st=0;
00207
00208
00209
00210
00211 int koi_o_capital=0;
00212 int koi_o=0;
00213 int cp1251_o_capital=0;
00214 int cp1251_o=0;
00215
00216 int koi_a_capital=0;
00217 int koi_a=0;
00218 int cp1251_a_capital=0;
00219 int cp1251_a=0;
00220
00221 int koi_s_capital=0;
00222 int koi_s=0;
00223 int cp1251_s_capital=0;
00224 int cp1251_s=0;
00225
00226 int koi_i_capital=0;
00227 int koi_i=0;
00228 int cp1251_i_capital=0;
00229 int cp1251_i=0;
00230
00231 int cp1251_small_range=0;
00232 int koi_small_range=0;
00233 int ibm866_small_range=0;
00234
00235 int i;
00236 for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
00237 {
00238 if (ptr[i]>0xdf)
00239 {
00240 ++cp1251_small_range;
00241
00242 if (ptr[i]==0xee)
00243 ++cp1251_o;
00244 else if (ptr[i]==0xe0)
00245 ++cp1251_a;
00246 else if (ptr[i]==0xe8)
00247 ++cp1251_i;
00248 else if (ptr[i]==0xf1)
00249 ++cp1251_s;
00250 else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)
00251 ++cp1251_st;
00252
00253 else if (ptr[i]==0xef)
00254 ++koi_o_capital;
00255 else if (ptr[i]==0xe1)
00256 ++koi_a_capital;
00257 else if (ptr[i]==0xe9)
00258 ++koi_i_capital;
00259 else if (ptr[i]==0xf3)
00260 ++koi_s_capital;
00261
00262 }
00263 else if (ptr[i]>0xbf)
00264 {
00265 ++koi_small_range;
00266
00267 if (ptr[i]==0xd0||ptr[i]==0xd1)
00268 ++utf8_mark;
00269 else if (ptr[i]==0xcf)
00270 ++koi_o;
00271 else if (ptr[i]==0xc1)
00272 ++koi_a;
00273 else if (ptr[i]==0xc9)
00274 ++koi_i;
00275 else if (ptr[i]==0xd3)
00276 ++koi_s;
00277 else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)
00278 ++koi_st;
00279
00280 else if (ptr[i]==0xce)
00281 ++cp1251_o_capital;
00282 else if (ptr[i]==0xc0)
00283 ++cp1251_a_capital;
00284 else if (ptr[i]==0xc8)
00285 ++cp1251_i_capital;
00286 else if (ptr[i]==0xd1)
00287 ++cp1251_s_capital;
00288 }
00289 else if (ptr[i]>0x9f && ptr[i]<0xb0)
00290 ++ibm866_small_range;
00291
00292 }
00293
00294
00295 if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
00296 {
00297 return "";
00298 }
00299
00300 if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
00301 {
00302 #ifdef DECODE_DEBUG
00303 kWarning() << "Cyr Enc Detection: UTF8";
00304 #endif
00305 return "UTF-8";
00306 }
00307
00308 if (ibm866_small_range>cp1251_small_range+koi_small_range)
00309 return "ibm866";
00310
00311
00312
00313
00314 if (cp1251_st==0 && koi_st>1)
00315 koi_score+=10;
00316 else if (koi_st==0 && cp1251_st>1)
00317 cp1251_score+=10;
00318
00319 if (cp1251_st && koi_st)
00320 {
00321 if (cp1251_st/koi_st>2)
00322 cp1251_score+=20;
00323 else if (koi_st/cp1251_st>2)
00324 koi_score+=20;
00325 }
00326
00327 if (cp1251_a>koi_a)
00328 cp1251_score+=10;
00329 else if (cp1251_a || koi_a)
00330 koi_score+=10;
00331
00332 if (cp1251_o>koi_o)
00333 cp1251_score+=10;
00334 else if (cp1251_o || koi_o)
00335 koi_score+=10;
00336
00337 if (cp1251_i>koi_i)
00338 cp1251_score+=10;
00339 else if (cp1251_i || koi_i)
00340 koi_score+=10;
00341
00342 if (cp1251_s>koi_s)
00343 cp1251_score+=10;
00344 else if (cp1251_s || koi_s)
00345 koi_score+=10;
00346
00347 if (cp1251_a_capital>koi_a_capital)
00348 cp1251_score+=9;
00349 else if (cp1251_a_capital || koi_a_capital)
00350 koi_score+=9;
00351
00352 if (cp1251_o_capital>koi_o_capital)
00353 cp1251_score+=9;
00354 else if (cp1251_o_capital || koi_o_capital)
00355 koi_score+=9;
00356
00357 if (cp1251_i_capital>koi_i_capital)
00358 cp1251_score+=9;
00359 else if (cp1251_i_capital || koi_i_capital)
00360 koi_score+=9;
00361
00362 if (cp1251_s_capital>koi_s_capital)
00363 cp1251_score+=9;
00364 else if (cp1251_s_capital || koi_s_capital)
00365 koi_score+=9;
00366 #ifdef DECODE_DEBUG
00367 kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score;
00368 #endif
00369 if (abs(koi_score-cp1251_score)<10)
00370 {
00371
00372 cp1251_score=cp1251_small_range;
00373 koi_score=koi_small_range;
00374 }
00375 if (cp1251_score>koi_score)
00376 return "cp1251";
00377 else
00378 return "koi8-u";
00379
00380
00381
00382
00383
00384
00385
00386
00387 }
00388
00389 static QByteArray automaticDetectionForGreek( const unsigned char* ptr, int size )
00390 {
00391 for ( int i = 0; i < size; ++i ) {
00392 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
00393 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
00394 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {
00395 return "cp1253";
00396 }
00397 }
00398
00399 return "iso-8859-7";
00400 }
00401
00402 static QByteArray automaticDetectionForHebrew( const unsigned char* ptr, int size )
00403 {
00404 for ( int i = 0; i < size; ++i ) {
00405 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B
00406 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )
00407 || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {
00408 return "cp1255";
00409 }
00410
00411 if ( ptr[ i ] == 0xDF )
00412 return "iso-8859-8-i";
00413 }
00414
00415 return "iso-8859-8-i";
00416 }
00417
00418 static QByteArray automaticDetectionForJapanese( const unsigned char* ptr, int size )
00419 {
00420 JapaneseCode kc;
00421
00422 switch ( kc.guess_jp( (const char*)ptr, size ) ) {
00423 case JapaneseCode::JIS:
00424 return "jis7";
00425 case JapaneseCode::EUC:
00426 return "eucjp";
00427 case JapaneseCode::SJIS:
00428 return "sjis";
00429 case JapaneseCode::UTF8:
00430 return "utf8";
00431 default:
00432 break;
00433 }
00434
00435 return "";
00436 }
00437
00438 static QByteArray automaticDetectionForTurkish( const unsigned char* ptr, int size )
00439 {
00440 for ( int i = 0; i < size; ++i ) {
00441 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {
00442 return "cp1254";
00443 }
00444 }
00445
00446 return "iso-8859-9";
00447 }
00448
00449 static QByteArray automaticDetectionForWesternEuropean( const unsigned char* ptr, int size )
00450 {
00451 --size;
00452 uint nonansi_count=0;
00453 for (int i=0; i<size; ++i)
00454 {
00455 if (ptr[i]>0x79)
00456 {
00457 ++nonansi_count;
00458 if ( ptr[i]>0xc1 && ptr[i]<0xf0 && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
00459 {
00460 return "UTF-8";
00461 }
00462 if (ptr[i] >= 0x78 && ptr[i]<=0x9F )
00463 {
00464 return "cp1252";
00465 }
00466 }
00467
00468 }
00469
00470 if (nonansi_count>0)
00471 return "iso-8859-15";
00472
00473 return "";
00474 }
00475
00476
00477
// Advances ptr past the end of an HTML comment.
//
// ptr is expected to point just behind the opening "<!--"; pEnd is one
// past the last readable byte.
//  - If the first character is '>', only that character is skipped.
//  - Otherwise ptr is moved behind the first "-->" or "--!>" terminator.
//  - If no terminator lies completely inside [ptr, pEnd), ptr ends at pEnd.
//
// Fix: no byte at or beyond pEnd is read any more, and ptr can no longer
// be moved past pEnd by a terminator that straddles the buffer end.
static void skipComment(const char *&ptr, const char *pEnd)
{
    const char *p = ptr;

    if (p < pEnd && *p == '>')
    {
        ptr = p + 1;
        return;
    }

    while (p < pEnd)
    {
        if (*p == '-')
        {
            const long available = static_cast<long>(pEnd - p);

            if (available >= 3 && p[1] == '-' && p[2] == '>')
            {
                p += 3;
                break;
            }

            if (available >= 4 && p[1] == '-' && p[2] == '!' && p[3] == '>')
            {
                p += 4;
                break;
            }
        }
        ++p;
    }

    ptr = p;
}
00510
00511
00512 static int findXMLEncoding(const QByteArray &str, int &encodingLength)
00513 {
00514 int len = str.length();
00515 int pos = str.indexOf("encoding");
00516 if (pos == -1)
00517 return -1;
00518 pos += 8;
00519
00520
00521 while (pos<len && str[pos]<=' ')
00522 ++pos;
00523
00524
00525
00526 if (pos>=len || str[pos] != '=')
00527 return -1;
00528 ++pos;
00529
00530
00531 while (pos<len && str[pos]<=' ')
00532 ++pos;
00533
00534
00535 if (pos >= len)
00536 return -1;
00537
00538
00539 char quoteMark = str[pos];
00540 if (quoteMark != '"' && quoteMark != '\'')
00541 return -1;
00542 ++pos;
00543
00544
00545 int end=pos;
00546 while (end<len && str[end]!=quoteMark)
00547 ++end;
00548
00549 if (end>=len)
00550 return -1;
00551
00552 encodingLength = end-pos;
00553 return pos;
00554 }
00555
00556 bool KEncodingDetector::processNull(char *data, int len)
00557 {
00558 bool bin=false;
00559 if(is16Bit(d->m_codec))
00560 {
00561 for (int i=1; i < len; i+=2)
00562 {
00563 if ((data[i]=='\0') && (data[i-1]=='\0'))
00564 {
00565 bin=true;
00566 data[i]=' ';
00567 }
00568 }
00569 return bin;
00570 }
00571
00572 int i = len-1;
00573 while(--i>=0)
00574 {
00575 if(data[i]==0)
00576 {
00577 bin=true;
00578 data[i]=' ';
00579 }
00580 }
00581 return bin;
00582 }
00583
00584
00585 bool KEncodingDetector::errorsIfUtf8 (const char* data, int length)
00586 {
00587 if (d->m_codec->mibEnum()!=MibUtf8)
00588 return false;
00589
00590
00591
00592
00593
00594 static const unsigned char highest1Bits = 0x80;
00595 static const unsigned char highest2Bits = 0xC0;
00596 static const unsigned char highest3Bits = 0xE0;
00597 static const unsigned char highest4Bits = 0xF0;
00598 static const unsigned char highest5Bits = 0xF8;
00599
00600 for (int i=0; i<length; ++i)
00601 {
00602 unsigned char c = data[i];
00603
00604 if (d->m_multiByte>0)
00605 {
00606 if ((c & highest2Bits) == 0x80)
00607 {
00608 --(d->m_multiByte);
00609 continue;
00610 }
00611 #ifdef DECODE_DEBUG
00612 kWarning() << "EncDetector: Broken UTF8";
00613 #endif
00614 return true;
00615 }
00616
00617
00618 if ((c & highest1Bits) == 0x00)
00619 continue;
00620
00621
00622 if ((c & highest3Bits) == 0xC0)
00623 {
00624 d->m_multiByte = 1;
00625 continue;
00626 }
00627
00628
00629 if ((c & highest4Bits) == 0xE0)
00630 {
00631 d->m_multiByte = 2;
00632 continue;
00633 }
00634
00635
00636 if ((c & highest5Bits) == 0xF0)
00637 {
00638 d->m_multiByte = 3;
00639 continue;
00640 }
00641 #ifdef DECODE_DEBUG
00642 kWarning() << "EncDetector:_Broken UTF8";
00643 #endif
00644 return true;
00645 }
00646 return false;
00647 }
00648
00649
00650 KEncodingDetector::KEncodingDetector() : d(new KEncodingDetectorPrivate)
00651 {
00652 }
00653
00654 KEncodingDetector::KEncodingDetector(QTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) :
00655 d(new KEncodingDetectorPrivate(codec,source,script))
00656 {
00657 }
00658
00659 KEncodingDetector::~KEncodingDetector()
00660 {
00661 delete d;
00662 }
00663
00664 void KEncodingDetector::setAutoDetectLanguage( KEncodingDetector::AutoDetectScript lang)
00665 {
00666 d->m_autoDetectLanguage=lang;
00667 }
00668 KEncodingDetector::AutoDetectScript KEncodingDetector::autoDetectLanguage() const
00669 {
00670 return d->m_autoDetectLanguage;
00671 }
00672
00673 KEncodingDetector::EncodingChoiceSource KEncodingDetector::encodingChoiceSource() const
00674 {
00675 return d->m_source;
00676 }
00677
00678 const char* KEncodingDetector::encoding() const
00679 {
00680 d->m_storeDecoderName = d->m_codec->name();
00681 return d->m_storeDecoderName.constData();
00682 }
00683
00684 bool KEncodingDetector::visuallyOrdered() const
00685 {
00686 return d->m_visualRTL;
00687 }
00688
00689
00690
00691
00692
00693
00694 QTextDecoder* KEncodingDetector::decoder()
00695 {
00696 return d->m_decoder;
00697 }
00698
00699 void KEncodingDetector::resetDecoder()
00700 {
00701 assert(d->m_defaultCodec);
00702 d->m_bufferForDefferedEncDetection.clear();
00703 d->m_writtingHappened = false;
00704 d->m_analyzeCalled = false;
00705 d->m_multiByte = 0;
00706 delete d->m_decoder;
00707 if (!d->m_codec)
00708 d->m_codec = d->m_defaultCodec;
00709 d->m_decoder = d->m_codec->makeDecoder();
00710 }
00711
00712 bool KEncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
00713 {
00714 QTextCodec *codec;
00715 QByteArray enc(_encoding);
00716 if(enc.isEmpty())
00717 {
00718 if (type==DefaultEncoding)
00719 codec=d->m_defaultCodec;
00720 else
00721 return false;
00722 }
00723 else
00724 {
00725
00726
00727 enc = enc.toLower();
00728
00729 if(enc=="visual")
00730 enc="iso8859-8";
00731 bool b;
00732 codec = KGlobal::charsets()->codecForName(enc, b);
00733 if (!b)
00734 return false;
00735 }
00736
00737 if (d->m_codec->mibEnum()==codec->mibEnum())
00738 {
00739
00740
00741 d->m_source = type;
00742 return true;
00743 }
00744
00745 if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec))
00746 {
00747
00748
00749 return false;
00750 }
00751
00752 if (codec->mibEnum() == Mib8859_8)
00753 {
00754
00755 codec = QTextCodec::codecForName("iso8859-8-i");
00756
00757
00758 if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical"))
00759 d->m_visualRTL = true;
00760 }
00761
00762 d->m_codec = codec;
00763 d->m_source = type;
00764 delete d->m_decoder;
00765 d->m_decoder = d->m_codec->makeDecoder();
00766 #ifdef DECODE_DEBUG
00767 kDebug(6005) << "KEncodingDetector::encoding used is" << d->m_codec->name();
00768 #endif
00769 return true;
00770 }
00771
00772 QString KEncodingDetector::decode(const char *data, int len)
00773 {
00774 processNull(const_cast<char *>(data),len);
00775 if (!d->m_analyzeCalled)
00776 {
00777 analyze(data,len);
00778 d->m_analyzeCalled=true;
00779 }
00780
00781 return d->m_decoder->toUnicode(data,len);
00782 }
00783
00784 QString KEncodingDetector::decode(const QByteArray &data)
00785 {
00786 processNull(const_cast<char *>(data.data()),data.size());
00787 if (!d->m_analyzeCalled)
00788 {
00789 analyze(data.data(),data.size());
00790 d->m_analyzeCalled=true;
00791 }
00792
00793 return d->m_decoder->toUnicode(data);
00794 }
00795
00796 QString KEncodingDetector::decodeWithBuffering(const char *data, int len)
00797 {
00798 #ifdef DECODE_DEBUG
00799 kWarning() << "KEncodingDetector: decoding "<<len<<" bytes";
00800 #endif
00801 if (d->m_writtingHappened)
00802 {
00803 #ifdef DECODE_DEBUG
00804 kWarning() << "KEncodingDetector: d->m_writtingHappened "<< d->m_codec->name();
00805 #endif
00806 processNull(const_cast<char *>(data),len);
00807 return d->m_decoder->toUnicode(data, len);
00808 }
00809 else
00810 {
00811 if (d->m_bufferForDefferedEncDetection.isEmpty())
00812 {
00813
00814
00815 if (analyze(data,len) && (d->m_seenBody || d->isExplicitlySpecifiedEncoding()))
00816 {
00817 #ifdef DECODE_DEBUG
00818 kWarning() << "KEncodingDetector: m_writtingHappened first time "<< d->m_codec->name();
00819 #endif
00820 processNull(const_cast<char *>(data),len);
00821 d->m_writtingHappened=true;
00822 return d->m_decoder->toUnicode(data, len);
00823 }
00824 else
00825 {
00826 #ifdef DECODE_DEBUG
00827 kWarning() << "KEncodingDetector: begin deffer";
00828 #endif
00829 d->m_bufferForDefferedEncDetection=data;
00830 }
00831 }
00832 else
00833 {
00834 d->m_bufferForDefferedEncDetection+=data;
00835
00836
00837 bool detected = analyze(d->m_bufferForDefferedEncDetection.constData(), d->m_bufferForDefferedEncDetection.length());
00838 if ((detected && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) ||
00839 d->m_bufferForDefferedEncDetection.length() > MAX_BUFFER)
00840 {
00841 d->m_writtingHappened=true;
00842 d->m_bufferForDefferedEncDetection.replace('\0',' ');
00843 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
00844 d->m_bufferForDefferedEncDetection.clear();
00845 #ifdef DECODE_DEBUG
00846 kWarning() << "KEncodingDetector: m_writtingHappened in the middle " << d->m_codec->name();
00847 #endif
00848 return result;
00849 }
00850 }
00851 }
00852
00853 return QString();
00854 }
00855
00856 bool KEncodingDetector::decodedInvalidCharacters() const
00857 {
00858 return d->m_decoder ? d->m_decoder->hasFailure() : false;
00859 }
00860
00861 QString KEncodingDetector::flush()
00862 {
00863 if (d->m_bufferForDefferedEncDetection.isEmpty())
00864 return QString();
00865
00866 d->m_bufferForDefferedEncDetection.replace('\0',' ');
00867 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
00868 d->m_bufferForDefferedEncDetection.clear();
00869 #ifdef DECODE_DEBUG
00870 kWarning() << "KEncodingDetector:flush() "<< d->m_bufferForDefferedEncDetection.length()<<" bytes "<< d->m_codec->name();
00871 #endif
00872 return result;
00873 }
00874
00875 bool KEncodingDetector::analyze(const char *data, int len)
00876 {
00877
00878
00879
00880 if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec)))
00881 {
00882
00883 const uchar *udata = (const uchar *)data;
00884 uchar c1 = *udata++;
00885 uchar c2 = *udata++;
00886 uchar c3 = *udata++;
00887
00888
00889 const char *autoDetectedEncoding;
00890 if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
00891 {
00892 autoDetectedEncoding = "UTF-16";
00893 }
00894 else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
00895 {
00896 autoDetectedEncoding = "UTF-8";
00897 }
00898 else if (c1 == 0x00 || c2 == 0x00)
00899 {
00900 uchar c4 = *udata++;
00901 uchar c5 = *udata++;
00902 uchar c6 = *udata++;
00903 uchar c7 = *udata++;
00904 uchar c8 = *udata++;
00905 uchar c9 = *udata++;
00906 uchar c10 = *udata++;
00907
00908 int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
00909 int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
00910 if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0))
00911 autoDetectedEncoding = "UTF-16";
00912 else
00913 autoDetectedEncoding = 0;
00914 }
00915 else
00916 {
00917 autoDetectedEncoding = 0;
00918 }
00919
00920
00921 if (autoDetectedEncoding != 0)
00922 {
00923 d->m_source = BOM;
00924 d->m_codec = QTextCodec::codecForName(autoDetectedEncoding);
00925 assert(d->m_codec);
00926
00927 delete d->m_decoder;
00928 d->m_decoder = d->m_codec->makeDecoder();
00929 #ifdef DECODE_DEBUG
00930 kWarning() << "Detection by BOM";
00931 #endif
00932 if (is16Bit(d->m_codec) && c2==0x00)
00933 {
00934
00935 char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
00936 d->m_decoder->toUnicode(reverseUtf16, 2);
00937 }
00938 return true;
00939 }
00940 }
00941
00942
00943 if (d->m_source==UserChosenEncoding)
00944 {
00945 #ifdef DECODE_DEBUG
00946 kWarning() << "KEncodingDetector: UserChosenEncoding exit ";
00947 #endif
00948
00949 if (errorsIfUtf8(data, len))
00950 setEncoding("",DefaultEncoding);
00951 return true;
00952 }
00953
00954
00955 if (d->m_source==EncodingFromHTTPHeader)
00956 return true;
00957
00958 if (!d->m_seenBody)
00959 {
00960
00961
00962
00963 const char *ptr = data;
00964 const char *pEnd = data+len;
00965
00966 while(ptr != pEnd)
00967 {
00968 if(*ptr!='<')
00969 {
00970 ++ptr;
00971 continue;
00972 }
00973 ++ptr;
00974
00975 if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-')
00976 {
00977 ptr += 3;
00978 skipComment(ptr, pEnd);
00979 continue;
00980 }
00981
00982
00983 if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l')
00984 {
00985 const char *end = ptr;
00986 while (*end != '>' && end < pEnd)
00987 end++;
00988 if (*end == '\0' || end == pEnd)
00989 break;
00990 QByteArray str(ptr, end - ptr);
00991 int length;
00992 int pos = findXMLEncoding(str, length);
00993
00994 if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader))
00995 {
00996 return true;
00997 }
00998 }
00999
01000
01001 while (
01002 !(((*ptr >= 'a') && (*ptr <= 'z')) ||
01003 ((*ptr >= 'A') && (*ptr <= 'Z')))
01004 && ptr < pEnd
01005 )
01006 ++ptr;
01007
01008 char tmp[5];
01009 int length=0;
01010 const char* max=ptr+4;
01011 if (pEnd<max)
01012 max=pEnd;
01013 while (
01014 (((*ptr >= 'a') && (*ptr <= 'z')) ||
01015 ((*ptr >= 'A') && (*ptr <= 'Z')) ||
01016 ((*ptr >= '0') && (*ptr <= '9')))
01017 && ptr < max
01018 )
01019 {
01020 tmp[length] = tolower( *ptr );
01021 ++ptr;
01022 ++length;
01023 }
01024 tmp[length] = 0;
01025 if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a')
01026 {
01027
01028 const char* end = ptr;
01029 while(*end != '>' && *end != '\0' && end<pEnd)
01030 end++;
01031
01032 QByteArray str( ptr, (end-ptr)+1);
01033 str = str.toLower();
01034 int pos=0;
01035
01036
01037 if( (pos = str.indexOf("charset")) == -1)
01038 continue;
01039 pos+=6;
01040
01041 if( (pos = str.indexOf("=", pos)) == -1)
01042 continue;
01043
01044
01045 ++pos;
01046
01047
01048 while (pos < (int)str.length() && str[pos] <= ' ')
01049 ++pos;
01050
01051
01052
01053 if (pos < (int)str.length() && str[pos] == '"')
01054 ++pos;
01055
01056 if ( pos == (int)str.length())
01057 continue;
01058
01059 int endpos = pos;
01060 while( endpos < str.length() &&
01061 (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
01062 && str[endpos] != ';' && str[endpos] != '>') )
01063 ++endpos;
01064 #ifdef DECODE_DEBUG
01065 kDebug( 6005 ) << "KEncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
01066 #endif
01067 if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag))
01068 return true;
01069 }
01070 else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y')
01071 {
01072 d->m_seenBody=true;
01073 break;
01074 }
01075 }
01076 }
01077
01078 if (len<20)
01079 return false;
01080
01081 #ifdef DECODE_DEBUG
01082 kDebug( 6005 ) << "KEncodingDetector: using heuristics (" << strlen(data) << ")";
01083 #endif
01084
01085 switch ( d->m_autoDetectLanguage)
01086 {
01087 case KEncodingDetector::Arabic:
01088 return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding);
01089
01090 case KEncodingDetector::Baltic:
01091 return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding);
01092
01093 case KEncodingDetector::CentralEuropean:
01094 return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding);
01095 break;
01096 case KEncodingDetector::Cyrillic:
01097 return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding);
01098
01099 case KEncodingDetector::Greek:
01100 return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding);
01101
01102 case KEncodingDetector::Hebrew:
01103 return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding);
01104
01105 case KEncodingDetector::Japanese:
01106 return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding);
01107
01108 case KEncodingDetector::Turkish:
01109 return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding);
01110
01111 case KEncodingDetector::WesternEuropean:
01112 if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding))
01113 return true;
01114 else if (d->m_defaultCodec->mibEnum()==MibLatin1)
01115 {
01116 return setEncoding("iso-8859-15",AutoDetectedEncoding);
01117 }
01118 else
01119 {
01120 return setEncoding("",DefaultEncoding);
01121 }
01122
01123 case KEncodingDetector::SemiautomaticDetection:
01124 case KEncodingDetector::ChineseSimplified:
01125 case KEncodingDetector::ChineseTraditional:
01126 case KEncodingDetector::Korean:
01127 case KEncodingDetector::Thai:
01128 case KEncodingDetector::Unicode:
01129 case KEncodingDetector::NorthernSaami:
01130 case KEncodingDetector::SouthEasternEurope:
01131 case KEncodingDetector::None:
01132
01133
01134 break;
01135 }
01136
01137 return true;
01138 }
01139
01140
01141 KEncodingDetector::AutoDetectScript KEncodingDetector::scriptForName(const QString& lang)
01142 {
01143 if (lang.isEmpty())
01144 return KEncodingDetector::None;
01145 else if (lang==i18nc("@item Text character set", "Unicode"))
01146 return KEncodingDetector::Unicode;
01147 else if (lang==i18nc("@item Text character set", "Cyrillic"))
01148 return KEncodingDetector::Cyrillic;
01149 else if (lang==i18nc("@item Text character set", "Western European"))
01150 return KEncodingDetector::WesternEuropean;
01151 else if (lang==i18nc("@item Text character set", "Central European"))
01152 return KEncodingDetector::CentralEuropean;
01153 else if (lang==i18nc("@item Text character set", "Greek"))
01154 return KEncodingDetector::Greek;
01155 else if (lang==i18nc("@item Text character set", "Hebrew"))
01156 return KEncodingDetector::Hebrew;
01157 else if (lang==i18nc("@item Text character set", "Turkish"))
01158 return KEncodingDetector::Turkish;
01159 else if (lang==i18nc("@item Text character set", "Japanese"))
01160 return KEncodingDetector::Japanese;
01161 else if (lang==i18nc("@item Text character set", "Baltic"))
01162 return KEncodingDetector::Baltic;
01163 else if (lang==i18nc("@item Text character set", "Arabic"))
01164 return KEncodingDetector::Arabic;
01165
01166 return KEncodingDetector::None;
01167 }
01168
01169 bool KEncodingDetector::hasAutoDetectionForScript(KEncodingDetector::AutoDetectScript script)
01170 {
01171 switch (script)
01172 {
01173 case KEncodingDetector::Arabic:
01174 return true;
01175 case KEncodingDetector::Baltic:
01176 return true;
01177 case KEncodingDetector::CentralEuropean:
01178 return true;
01179 case KEncodingDetector::Cyrillic:
01180 return true;
01181 case KEncodingDetector::Greek:
01182 return true;
01183 case KEncodingDetector::Hebrew:
01184 return true;
01185 case KEncodingDetector::Japanese:
01186 return true;
01187 case KEncodingDetector::Turkish:
01188 return true;
01189 case KEncodingDetector::WesternEuropean:
01190 return true;
01191 case KEncodingDetector::ChineseTraditional:
01192 return true;
01193 case KEncodingDetector::ChineseSimplified:
01194 return true;
01195 case KEncodingDetector::Unicode:
01196 return true;
01197 break;
01198 default:
01199 return false;
01200 }
01201 }
01202
01203 QString KEncodingDetector::nameForScript(KEncodingDetector::AutoDetectScript script)
01204 {
01205 switch (script)
01206 {
01207 case KEncodingDetector::Arabic:
01208 return i18nc("@item Text character set", "Arabic");
01209 break;
01210 case KEncodingDetector::Baltic:
01211 return i18nc("@item Text character set", "Baltic");
01212 break;
01213 case KEncodingDetector::CentralEuropean:
01214 return i18nc("@item Text character set", "Central European");
01215 break;
01216 case KEncodingDetector::Cyrillic:
01217 return i18nc("@item Text character set", "Cyrillic");
01218 break;
01219 case KEncodingDetector::Greek:
01220 return i18nc("@item Text character set", "Greek");
01221 break;
01222 case KEncodingDetector::Hebrew:
01223 return i18nc("@item Text character set", "Hebrew");
01224 break;
01225 case KEncodingDetector::Japanese:
01226 return i18nc("@item Text character set", "Japanese");
01227 break;
01228 case KEncodingDetector::Turkish:
01229 return i18nc("@item Text character set", "Turkish");
01230 break;
01231 case KEncodingDetector::WesternEuropean:
01232 return i18nc("@item Text character set", "Western European");
01233 break;
01234 case KEncodingDetector::ChineseTraditional:
01235 return i18nc("@item Text character set", "Chinese Traditional");
01236 break;
01237 case KEncodingDetector::ChineseSimplified:
01238 return i18nc("@item Text character set", "Chinese Simplified");
01239 break;
01240 case KEncodingDetector::Korean:
01241 return i18nc("@item Text character set", "Korean");
01242 break;
01243 case KEncodingDetector::Thai:
01244 return i18nc("@item Text character set", "Thai");
01245 break;
01246 case KEncodingDetector::Unicode:
01247 return i18nc("@item Text character set", "Unicode");
01248 break;
01249
01250 default:
01251 return QString();
01252
01253 }
01254 }
01255
01256 #undef DECODE_DEBUG
01257