Главная страница
    Top.Mail.Ru    Яндекс.Метрика
Форум: "Основная";
Текущий архив: 2003.11.13;
Скачать: [xml.tar.bz2];

Вниз

Autodetect Charset   Найти похожие ветки 

 
Andrey Klimov   (2003-10-31 18:28) [0]

Кто-нибудь знает алгоритм автоматического определения кодировки (koi8-r, win-1251, ....) ?

Как, например, АВТОМАТИЧЕСКИ узнать в какой кодировке строка " "вЙЪОЕc Ч УФЙМЕ жAоK" - кПОБc PЙДДeТУФТПМЕ, 22 ПЛФСВТС УЗП ЕБЩПВД"......

АВТОМАТИЧЕСКИ...


 
Reindeer Moss Eater   (2003-10-31 18:34) [1]

Статистический метод


 
vuk   (2003-10-31 19:45) [2]

Имеется в виду определение кодировки по частоте встречаемости символов (или пар символов) с определенными кодами.


 
Dimaxx   (2003-10-31 23:15) [3]

type
TCodePage = (cpWin1251, cp866, cpKOI8R);

PMap = ^TMap;
TMap = array [ #$80..#$FF] of Char;

function GetMap(CP: TCodePage): PMap;
{ должна возвращать указатель на таблицу перекодировки из CP в Windows1251 (nil для CP = cpWin1251) }
begin
GetMap:=nil;
end;

function DetermineRussian(Buf: PChar; Count: integer): TCodePage;
const ModelBigrams: array [0..33,0..33] of Byte = ({АБВГДЕЖЗИЙКЛМHОПРСТУФХЦЧШЩЪЫЬЭЮЯ_Ё}
{А}(0,20,44,12,22,23,16,60,4,9,63,93,47,110,0,16,35,61,81,1,5,13,24,17,12,4,0,0,0,0,14,31,205,1),
{Б}(19,0,0,0,4,19,0,0,8,0,2,15,1,4,41,0,15,5,0,15,0,2,1,0,0,6,16,37,0,0,0,4,
{В}(97,0,1,0,2,57,0,5,40,0,4,25,2,23,78,2,8,28,4,12,0,1,0,0,8,1,0,40,1,0,0,5,106,3),
{Г}(13,0,0,0,9,5,0,0,15,0,1,17,1,2,96,0,24,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,8,0),
{Д}(63,0,9,1,2,71,1,0,35,0,3,16,2,22,50,2,19,9,2,25,0,2,1,0,1,0,1,9,4,0,1,5,
{Е}(4,14,15,34,56,22,13,14,2,34,39,77,73,150,6,9,101,64,81,1,0,15,5,12,10,6,0,0,0,0,3,4,235,1),
{Ж}(13,0,0,0,12,47,0,0,16,0,1,0,0,23,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,2,2),
{З}(76,2,11,3,11,4,1,0,7,0,2,4,11,24,17,0,6,1,0,8,0,0,0,0,0,0,0,16,6,0,1,4,1,
{И}(7,9,32,5,18,60,4,42,31,27,28,46,55,49,12,7,26,60,53,0,5,25,14,28,4,1,0,0,0,0,9,56,255,0),
{Й}(0,0,0,0,2,0,0,0,0,0,1,3,0,3,0,0,0,10,3,0,0,0,0,1,1,0,0,0,0,0,0,0,122,0),
{К}(92,0,3,0,0,7,2,1,39,0,0,27,0,14,110,0,18,5,35,18,0,0,11,0,0,0,0,0,0,0,0,0,5,5,0),
{Л}(85,1,0,2,1,70,6,0,85,0,5,3,0,9,67,1,0,9,0,15,0,0,0,2,0,0,0,9,66,0,15,43,
{М}(44,0,0,0,0,65,0,0,47,0,1,1,10,15,57,7,0,2,0,24,0,0,0,0,0,0,0,28,0,0,0,8,
{}(139,0,0,1,11,108,0,4,152,0,7,0,1,69,161,0,0,8,25,24,5,1,5,2,0,1,0,83,10,0,1,29,38,5),
{О}(0,72,139,76,74,32,32,19,12,52,21,93,68,72,7,34,93,102,98,1,2,6,6,19,15,2,0,0,0,1,4,9,252,2),
{П}(17,0,0,0,0,43,0,0,14,0,1,9,0,1,125,3,120,1,2,8,0,0,0,0,0,0,0,3,6,0,0,3,2,2),
{Р}(151,1,6,4,3,103,7,0,76,0,4,0,11,10,117,1,0,5,9,39,2,5,0,1,3,0,0,24,7,0,1,10,22,5),
{С}(24,1,21,0,3,39,0,0,33,0,56,41,11,15,58,30,5,30,183,16,0,4,1,4,1,0,0,8,25,0,1,50,41,2),
{Т}(83,0,43,0,3,87,0,0,71,0,9,3,2,26,180,0,55,33,1,23,1,0,1,4,0,0,0,20,78,0,0,5,82,4),
{У}(3,6,7,14,19,8,13,6,0,1,13,15,10,7,0,12,17,16,19,0,1,3,0,12,5,8,0,0,0,0,2,
{Ф}(4,0,0,0,0,4,0,0,11,0,0,1,0,0,9,0,3,0,0,4,1,0,0,0,0,0,0,0,0,0,0,0,2,0),
{Х}(9,0,2,0,0,2,0,0,5,0,0,1,0,5,26,0,4,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,76,0),
{Ц}(5,0,0,0,0,16,0,0,48,0,1,0,0,0,4,0,0,0,0,3,0,0,0,0,0,0,0,2,0,0,0,0,3,0),
{Ч}(30,0,0,0,0,52,0,0,23,0,3,1,0,14,1,0,0,0,36,5,0,0,0,0,1,0,0,0,1,0,0,0,2,2),
{Ш}(13,0,0,0,0,28,0,0,17,0,4,4,0,4,3,0,0,0,1,3,0,0,0,0,0,0,0,0,3,0,0,0,1,1),
{Щ}(6,0,0,0,0,23,0,0,16,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1),
{Ъ}(0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0),
{Ы}(0,5,14,1,3,28,0,2,0,22,6,19,21,2,0,5,4,7,10,0,0,37,0,3,4,0,0,0,0,0,0,1,8,
{Ь}(0,1,0,0,0,9,0,10,1,0,13,0,2,26,0,0,0,10,3,0,0,0,1,0,6,0,0,0,0,0,6,4,117,0),
{Э}(0,0,0,0,0,0,0,0,0,0,3,3,0,0,0,0,0,0,31,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0),
{Ю}(0,5,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,1,15,0,0,0,1,4,1,15,0,0,0,0,0,0,38,0),
{Я}(0,0,9,2,7,10,3,19,0,0,1,6,7,8,0,0,2,6,19,0,0,3,5,1,0,3,0,0,0,0,5,2,177,0),
{_}(42,80,193,43,109,41,18,53,159,0,144,27,83,176,187,229,70,231,99,47,15,13,6,58,7,0,0,0,0,38,0,22,0,2),
{Ё}(0,0,0,0,3,0,0,0,0,0,2,4,4,8,0,0,5,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0));

{ "рейтинг" буквы Ё условно принимается равным 1/20 от "рейтинга" буквы E, если сочетание с участием Ё корректно, иначе - 0 }

type
TVariation = array[0..33,0..33] of integer;

var I,J,iC,iPredC,Max: integer;
C: Char;
CP: TCodePage;
D,MinD,Factor: double;
AMap: PMap;
PV: ^TVariation;
Vars: array[TCodePage] of TVariation;
begin
DetermineRussian:=cpWin1251; { по yмолчанию }
{ вычисление распределений биграмм }
FillChar(Vars,sizeof(Vars),0);
for CP:=Low(Vars) to High(Vars) do
begin
AMap:=GetMap(CP);
PV:=@Vars[CP];
iPredC:=32;
for I:=0 to pred(Count) do
begin
C:=Buf[I];
iC:=32;
if C>=#128 then
begin
if AMap<>nil then C:=AMap^[C];
if not (C in ["Ё","ё"]) then
begin
C:=Chr(Ord(C) and not 32);
{"a".."я"->"А".."Я"}
if C in ["А".."Я"] then iC:=Ord(C)-Ord("А");
end
else iC:=33;
end;
Inc(PV^[iPredC,iC]);
iPredC:=iC;
end;
end;
{ вычисление метрики и определение наиболее правдоподобной кодировки }
MinD:=0;
for CP:=Low(Vars) to High(Vars) do
begin
PV:=@Vars[CP];
PV^[32,32]:=0;
Max:=1;
for I:=0 to 33 do
for J:=0 to 33 do if PV^[I,J]>Max then Max:=PV^[I,J];
Factor:=255/Max; { ноpмализация }
D:=0;
for I:=0 to 33 do
for J:=0 to 33 do D:=D+Abs(PV^[I,J]*Factor-ModelBigrams[I,J]);
if (MinD=0) or (D<MinD) then
begin
MinD:=D;
DetermineRussian:=CP;
end;
end;
end;

begin
{ тест: слово "Пример" в разных кодировках (веpоятность ошибок на таких коpотких текстах высока - в данном слyчае пpосто повезло!) }
writeln(DetermineRussian(#$CF#$F0#$E8#$EC#$E5#$F0, 6) = cpWin1251);
writeln(DetermineRussian(#$8F#$E0#$A8#$AC#$A5#$E0, 6) = cp866);
writeln(DetermineRussian(#$F0#$D2#$C9#$CD#$C5#$D2, 6) = cpKOI8R);
readln;
end.



Страницы: 1 вся ветка

Форум: "Основная";
Текущий архив: 2003.11.13;
Скачать: [xml.tar.bz2];

Наверх





Память: 0.47 MB
Время: 0.051 c
9-40763
higor
2003-05-05 07:54
2003.11.13
Изменение разрешения экрана в Delphi


1-41400
nazar
2003-10-30 00:26
2003.11.13
Подкажите installer который б мог инсталлировать IDAPI 2.5


6-41863
Maxim Volobuev
2003-09-06 18:38
2003.11.13
WebBrowser и Socks прокси


4-42273
misterix
2003-09-12 11:02
2003.11.13
Таскбар и трей.


1-41653
Endi
2003-10-25 16:52
2003.11.13
Как считать строки, прерывающиеся 0 байтом.





Afrikaans Albanian Arabic Armenian Azerbaijani Basque Belarusian Bulgarian Catalan Chinese (Simplified) Chinese (Traditional) Croatian Czech Danish Dutch English Estonian Filipino Finnish French
Galician Georgian German Greek Haitian Creole Hebrew Hindi Hungarian Icelandic Indonesian Irish Italian Japanese Korean Latvian Lithuanian Macedonian Malay Maltese Norwegian
Persian Polish Portuguese Romanian Russian Serbian Slovak Slovenian Spanish Swahili Swedish Thai Turkish Ukrainian Urdu Vietnamese Welsh Yiddish Bengali Bosnian
Cebuano Esperanto Gujarati Hausa Hmong Igbo Javanese Kannada Khmer Lao Latin Maori Marathi Mongolian Nepali Punjabi Somali Tamil Telugu Yoruba
Zulu
Английский Французский Немецкий Итальянский Португальский Русский Испанский