Главная страница
    Top.Mail.Ru    Яндекс.Метрика
Форум: "Основная";
Текущий архив: 2003.11.13;
Скачать: [xml.tar.bz2];

Вниз

Autodetect Charset   Найти похожие ветки 

 
Andrey Klimov   (2003-10-31 18:28) [0]

Кто-нибудь знает алгоритм автоматического определения кодировки (koi8-r, win-1251, ....) ?

Как, например, АВТОМАТИЧЕСКИ узнать в какой кодировке строка " "вЙЪОЕc Ч УФЙМЕ жAоK" - кПОБc PЙДДeТУФТПМЕ, 22 ПЛФСВТС УЗП ЕБЩПВД"......

АВТОМАТИЧЕСКИ...


 
Reindeer Moss Eater   (2003-10-31 18:34) [1]

Статистический метод


 
vuk   (2003-10-31 19:45) [2]

Имеется в виду определение кодировки по частоте встречаемости символов (или пар символов) с определенными кодами.


 
Dimaxx   (2003-10-31 23:15) [3]

type
TCodePage = (cpWin1251, cp866, cpKOI8R);

PMap = ^TMap;
TMap = array [ #$80..#$FF] of Char;

function GetMap(CP: TCodePage): PMap;
{ должна возвращать указатель на таблицу перекодировки из CP в Windows1251 (nil для CP = cpWin1251) }
begin
GetMap:=nil;
end;

function DetermineRussian(Buf: PChar; Count: integer): TCodePage;
const ModelBigrams: array [0..33,0..33] of Byte = ({АБВГДЕЖЗИЙКЛМHОПРСТУФХЦЧШЩЪЫЬЭЮЯ_Ё}
{А}(0,20,44,12,22,23,16,60,4,9,63,93,47,110,0,16,35,61,81,1,5,13,24,17,12,4,0,0,0,0,14,31,205,1),
{Б}(19,0,0,0,4,19,0,0,8,0,2,15,1,4,41,0,15,5,0,15,0,2,1,0,0,6,16,37,0,0,0,4,
{В}(97,0,1,0,2,57,0,5,40,0,4,25,2,23,78,2,8,28,4,12,0,1,0,0,8,1,0,40,1,0,0,5,106,3),
{Г}(13,0,0,0,9,5,0,0,15,0,1,17,1,2,96,0,24,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,8,0),
{Д}(63,0,9,1,2,71,1,0,35,0,3,16,2,22,50,2,19,9,2,25,0,2,1,0,1,0,1,9,4,0,1,5,
{Е}(4,14,15,34,56,22,13,14,2,34,39,77,73,150,6,9,101,64,81,1,0,15,5,12,10,6,0,0,0,0,3,4,235,1),
{Ж}(13,0,0,0,12,47,0,0,16,0,1,0,0,23,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,2,2),
{З}(76,2,11,3,11,4,1,0,7,0,2,4,11,24,17,0,6,1,0,8,0,0,0,0,0,0,0,16,6,0,1,4,1,
{И}(7,9,32,5,18,60,4,42,31,27,28,46,55,49,12,7,26,60,53,0,5,25,14,28,4,1,0,0,0,0,9,56,255,0),
{Й}(0,0,0,0,2,0,0,0,0,0,1,3,0,3,0,0,0,10,3,0,0,0,0,1,1,0,0,0,0,0,0,0,122,0),
{К}(92,0,3,0,0,7,2,1,39,0,0,27,0,14,110,0,18,5,35,18,0,0,11,0,0,0,0,0,0,0,0,0,5,5,0),
{Л}(85,1,0,2,1,70,6,0,85,0,5,3,0,9,67,1,0,9,0,15,0,0,0,2,0,0,0,9,66,0,15,43,
{М}(44,0,0,0,0,65,0,0,47,0,1,1,10,15,57,7,0,2,0,24,0,0,0,0,0,0,0,28,0,0,0,8,
{}(139,0,0,1,11,108,0,4,152,0,7,0,1,69,161,0,0,8,25,24,5,1,5,2,0,1,0,83,10,0,1,29,38,5),
{О}(0,72,139,76,74,32,32,19,12,52,21,93,68,72,7,34,93,102,98,1,2,6,6,19,15,2,0,0,0,1,4,9,252,2),
{П}(17,0,0,0,0,43,0,0,14,0,1,9,0,1,125,3,120,1,2,8,0,0,0,0,0,0,0,3,6,0,0,3,2,2),
{Р}(151,1,6,4,3,103,7,0,76,0,4,0,11,10,117,1,0,5,9,39,2,5,0,1,3,0,0,24,7,0,1,10,22,5),
{С}(24,1,21,0,3,39,0,0,33,0,56,41,11,15,58,30,5,30,183,16,0,4,1,4,1,0,0,8,25,0,1,50,41,2),
{Т}(83,0,43,0,3,87,0,0,71,0,9,3,2,26,180,0,55,33,1,23,1,0,1,4,0,0,0,20,78,0,0,5,82,4),
{У}(3,6,7,14,19,8,13,6,0,1,13,15,10,7,0,12,17,16,19,0,1,3,0,12,5,8,0,0,0,0,2,
{Ф}(4,0,0,0,0,4,0,0,11,0,0,1,0,0,9,0,3,0,0,4,1,0,0,0,0,0,0,0,0,0,0,0,2,0),
{Х}(9,0,2,0,0,2,0,0,5,0,0,1,0,5,26,0,4,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,76,0),
{Ц}(5,0,0,0,0,16,0,0,48,0,1,0,0,0,4,0,0,0,0,3,0,0,0,0,0,0,0,2,0,0,0,0,3,0),
{Ч}(30,0,0,0,0,52,0,0,23,0,3,1,0,14,1,0,0,0,36,5,0,0,0,0,1,0,0,0,1,0,0,0,2,2),
{Ш}(13,0,0,0,0,28,0,0,17,0,4,4,0,4,3,0,0,0,1,3,0,0,0,0,0,0,0,0,3,0,0,0,1,1),
{Щ}(6,0,0,0,0,23,0,0,16,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1),
{Ъ}(0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0),
{Ы}(0,5,14,1,3,28,0,2,0,22,6,19,21,2,0,5,4,7,10,0,0,37,0,3,4,0,0,0,0,0,0,1,8,
{Ь}(0,1,0,0,0,9,0,10,1,0,13,0,2,26,0,0,0,10,3,0,0,0,1,0,6,0,0,0,0,0,6,4,117,0),
{Э}(0,0,0,0,0,0,0,0,0,0,3,3,0,0,0,0,0,0,31,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0),
{Ю}(0,5,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,1,15,0,0,0,1,4,1,15,0,0,0,0,0,0,38,0),
{Я}(0,0,9,2,7,10,3,19,0,0,1,6,7,8,0,0,2,6,19,0,0,3,5,1,0,3,0,0,0,0,5,2,177,0),
{_}(42,80,193,43,109,41,18,53,159,0,144,27,83,176,187,229,70,231,99,47,15,13,6,58,7,0,0,0,0,38,0,22,0,2),
{Ё}(0,0,0,0,3,0,0,0,0,0,2,4,4,8,0,0,5,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0));

{ "рейтинг" буквы Ё условно принимается равным 1/20 от "рейтинга" буквы E, если сочетание с участием Ё корректно, иначе - 0 }

type
TVariation = array[0..33,0..33] of integer;

var I,J,iC,iPredC,Max: integer;
C: Char;
CP: TCodePage;
D,MinD,Factor: double;
AMap: PMap;
PV: ^TVariation;
Vars: array[TCodePage] of TVariation;
begin
DetermineRussian:=cpWin1251; { по yмолчанию }
{ вычисление распределений биграмм }
FillChar(Vars,sizeof(Vars),0);
for CP:=Low(Vars) to High(Vars) do
begin
AMap:=GetMap(CP);
PV:=@Vars[CP];
iPredC:=32;
for I:=0 to pred(Count) do
begin
C:=Buf[I];
iC:=32;
if C>=#128 then
begin
if AMap<>nil then C:=AMap^[C];
if not (C in ["Ё","ё"]) then
begin
C:=Chr(Ord(C) and not 32);
{"a".."я"->"А".."Я"}
if C in ["А".."Я"] then iC:=Ord(C)-Ord("А");
end
else iC:=33;
end;
Inc(PV^[iPredC,iC]);
iPredC:=iC;
end;
end;
{ вычисление метрики и определение наиболее правдоподобной кодировки }
MinD:=0;
for CP:=Low(Vars) to High(Vars) do
begin
PV:=@Vars[CP];
PV^[32,32]:=0;
Max:=1;
for I:=0 to 33 do
for J:=0 to 33 do if PV^[I,J]>Max then Max:=PV^[I,J];
Factor:=255/Max; { ноpмализация }
D:=0;
for I:=0 to 33 do
for J:=0 to 33 do D:=D+Abs(PV^[I,J]*Factor-ModelBigrams[I,J]);
if (MinD=0) or (D<MinD) then
begin
MinD:=D;
DetermineRussian:=CP;
end;
end;
end;

begin
{ тест: слово "Пример" в разных кодировках (веpоятность ошибок на таких коpотких текстах высока - в данном слyчае пpосто повезло!) }
writeln(DetermineRussian(#$CF#$F0#$E8#$EC#$E5#$F0, 6) = cpWin1251);
writeln(DetermineRussian(#$8F#$E0#$A8#$AC#$A5#$E0, 6) = cp866);
writeln(DetermineRussian(#$F0#$D2#$C9#$CD#$C5#$D2, 6) = cpKOI8R);
readln;
end.



Страницы: 1 вся ветка

Форум: "Основная";
Текущий архив: 2003.11.13;
Скачать: [xml.tar.bz2];

Наверх




Память: 0.46 MB
Время: 0.034 c
1-41298
Илайдж
2003-10-30 08:44
2003.11.13
Что за компонент?


1-41614
vic_vm
2003-10-25 20:29
2003.11.13
Не могу перевести C++ код на Delphi


3-41044
uu
2003-10-24 15:46
2003.11.13
fastreport


1-41161
Yanval
2003-10-31 12:12
2003.11.13
title в TWebBroswer


1-41140
bon
2003-11-03 15:02
2003.11.13
Свойства





Afrikaans Albanian Arabic Armenian Azerbaijani Basque Belarusian Bulgarian Catalan Chinese (Simplified) Chinese (Traditional) Croatian Czech Danish Dutch English Estonian Filipino Finnish French
Galician Georgian German Greek Haitian Creole Hebrew Hindi Hungarian Icelandic Indonesian Irish Italian Japanese Korean Latvian Lithuanian Macedonian Malay Maltese Norwegian
Persian Polish Portuguese Romanian Russian Serbian Slovak Slovenian Spanish Swahili Swedish Thai Turkish Ukrainian Urdu Vietnamese Welsh Yiddish Bengali Bosnian
Cebuano Esperanto Gujarati Hausa Hmong Igbo Javanese Kannada Khmer Lao Latin Maori Marathi Mongolian Nepali Punjabi Somali Tamil Telugu Yoruba
Zulu
Английский Французский Немецкий Итальянский Португальский Русский Испанский