Главная страница
    Top.Mail.Ru    Яндекс.Метрика
Форум: "Основная";
Текущий архив: 2003.11.13;
Скачать: [xml.tar.bz2];

Вниз

Autodetect Charset   Найти похожие ветки 

 
Andrey Klimov   (2003-10-31 18:28) [0]

Кто-нибудь знает алгоритм автоматического определения кодировки (koi8-r, win-1251, ....) ?

Как, например, АВТОМАТИЧЕСКИ узнать в какой кодировке строка " "вЙЪОЕc Ч УФЙМЕ жAоK" - кПОБc PЙДДeТУФТПМЕ, 22 ПЛФСВТС УЗП ЕБЩПВД"......

АВТОМАТИЧЕСКИ...


 
Reindeer Moss Eater   (2003-10-31 18:34) [1]

Статистический метод


 
vuk   (2003-10-31 19:45) [2]

Имеется в виду определение кодировки по частоте встречаемости символов (или пар символов) с определенными кодами.


 
Dimaxx   (2003-10-31 23:15) [3]

type
TCodePage = (cpWin1251, cp866, cpKOI8R);

PMap = ^TMap;
TMap = array [ #$80..#$FF] of Char;

function GetMap(CP: TCodePage): PMap;
{ должна возвращать указатель на таблицу перекодировки из CP в Windows1251 (nil для CP = cpWin1251) }
begin
GetMap:=nil;
end;

function DetermineRussian(Buf: PChar; Count: integer): TCodePage;
const ModelBigrams: array [0..33,0..33] of Byte = ({АБВГДЕЖЗИЙКЛМHОПРСТУФХЦЧШЩЪЫЬЭЮЯ_Ё}
{А}(0,20,44,12,22,23,16,60,4,9,63,93,47,110,0,16,35,61,81,1,5,13,24,17,12,4,0,0,0,0,14,31,205,1),
{Б}(19,0,0,0,4,19,0,0,8,0,2,15,1,4,41,0,15,5,0,15,0,2,1,0,0,6,16,37,0,0,0,4,
{В}(97,0,1,0,2,57,0,5,40,0,4,25,2,23,78,2,8,28,4,12,0,1,0,0,8,1,0,40,1,0,0,5,106,3),
{Г}(13,0,0,0,9,5,0,0,15,0,1,17,1,2,96,0,24,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,8,0),
{Д}(63,0,9,1,2,71,1,0,35,0,3,16,2,22,50,2,19,9,2,25,0,2,1,0,1,0,1,9,4,0,1,5,
{Е}(4,14,15,34,56,22,13,14,2,34,39,77,73,150,6,9,101,64,81,1,0,15,5,12,10,6,0,0,0,0,3,4,235,1),
{Ж}(13,0,0,0,12,47,0,0,16,0,1,0,0,23,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,2,2),
{З}(76,2,11,3,11,4,1,0,7,0,2,4,11,24,17,0,6,1,0,8,0,0,0,0,0,0,0,16,6,0,1,4,1,
{И}(7,9,32,5,18,60,4,42,31,27,28,46,55,49,12,7,26,60,53,0,5,25,14,28,4,1,0,0,0,0,9,56,255,0),
{Й}(0,0,0,0,2,0,0,0,0,0,1,3,0,3,0,0,0,10,3,0,0,0,0,1,1,0,0,0,0,0,0,0,122,0),
{К}(92,0,3,0,0,7,2,1,39,0,0,27,0,14,110,0,18,5,35,18,0,0,11,0,0,0,0,0,0,0,0,0,5,5,0),
{Л}(85,1,0,2,1,70,6,0,85,0,5,3,0,9,67,1,0,9,0,15,0,0,0,2,0,0,0,9,66,0,15,43,
{М}(44,0,0,0,0,65,0,0,47,0,1,1,10,15,57,7,0,2,0,24,0,0,0,0,0,0,0,28,0,0,0,8,
{}(139,0,0,1,11,108,0,4,152,0,7,0,1,69,161,0,0,8,25,24,5,1,5,2,0,1,0,83,10,0,1,29,38,5),
{О}(0,72,139,76,74,32,32,19,12,52,21,93,68,72,7,34,93,102,98,1,2,6,6,19,15,2,0,0,0,1,4,9,252,2),
{П}(17,0,0,0,0,43,0,0,14,0,1,9,0,1,125,3,120,1,2,8,0,0,0,0,0,0,0,3,6,0,0,3,2,2),
{Р}(151,1,6,4,3,103,7,0,76,0,4,0,11,10,117,1,0,5,9,39,2,5,0,1,3,0,0,24,7,0,1,10,22,5),
{С}(24,1,21,0,3,39,0,0,33,0,56,41,11,15,58,30,5,30,183,16,0,4,1,4,1,0,0,8,25,0,1,50,41,2),
{Т}(83,0,43,0,3,87,0,0,71,0,9,3,2,26,180,0,55,33,1,23,1,0,1,4,0,0,0,20,78,0,0,5,82,4),
{У}(3,6,7,14,19,8,13,6,0,1,13,15,10,7,0,12,17,16,19,0,1,3,0,12,5,8,0,0,0,0,2,
{Ф}(4,0,0,0,0,4,0,0,11,0,0,1,0,0,9,0,3,0,0,4,1,0,0,0,0,0,0,0,0,0,0,0,2,0),
{Х}(9,0,2,0,0,2,0,0,5,0,0,1,0,5,26,0,4,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,76,0),
{Ц}(5,0,0,0,0,16,0,0,48,0,1,0,0,0,4,0,0,0,0,3,0,0,0,0,0,0,0,2,0,0,0,0,3,0),
{Ч}(30,0,0,0,0,52,0,0,23,0,3,1,0,14,1,0,0,0,36,5,0,0,0,0,1,0,0,0,1,0,0,0,2,2),
{Ш}(13,0,0,0,0,28,0,0,17,0,4,4,0,4,3,0,0,0,1,3,0,0,0,0,0,0,0,0,3,0,0,0,1,1),
{Щ}(6,0,0,0,0,23,0,0,16,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1),
{Ъ}(0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0),
{Ы}(0,5,14,1,3,28,0,2,0,22,6,19,21,2,0,5,4,7,10,0,0,37,0,3,4,0,0,0,0,0,0,1,8,
{Ь}(0,1,0,0,0,9,0,10,1,0,13,0,2,26,0,0,0,10,3,0,0,0,1,0,6,0,0,0,0,0,6,4,117,0),
{Э}(0,0,0,0,0,0,0,0,0,0,3,3,0,0,0,0,0,0,31,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0),
{Ю}(0,5,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,1,15,0,0,0,1,4,1,15,0,0,0,0,0,0,38,0),
{Я}(0,0,9,2,7,10,3,19,0,0,1,6,7,8,0,0,2,6,19,0,0,3,5,1,0,3,0,0,0,0,5,2,177,0),
{_}(42,80,193,43,109,41,18,53,159,0,144,27,83,176,187,229,70,231,99,47,15,13,6,58,7,0,0,0,0,38,0,22,0,2),
{Ё}(0,0,0,0,3,0,0,0,0,0,2,4,4,8,0,0,5,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0));

{ "рейтинг" буквы Ё условно принимается равным 1/20 от "рейтинга" буквы E, если сочетание с участием Ё корректно, иначе - 0 }

type
TVariation = array[0..33,0..33] of integer;

var I,J,iC,iPredC,Max: integer;
C: Char;
CP: TCodePage;
D,MinD,Factor: double;
AMap: PMap;
PV: ^TVariation;
Vars: array[TCodePage] of TVariation;
begin
DetermineRussian:=cpWin1251; { по yмолчанию }
{ вычисление распределений биграмм }
FillChar(Vars,sizeof(Vars),0);
for CP:=Low(Vars) to High(Vars) do
begin
AMap:=GetMap(CP);
PV:=@Vars[CP];
iPredC:=32;
for I:=0 to pred(Count) do
begin
C:=Buf[I];
iC:=32;
if C>=#128 then
begin
if AMap<>nil then C:=AMap^[C];
if not (C in ["Ё","ё"]) then
begin
C:=Chr(Ord(C) and not 32);
{"a".."я"->"А".."Я"}
if C in ["А".."Я"] then iC:=Ord(C)-Ord("А");
end
else iC:=33;
end;
Inc(PV^[iPredC,iC]);
iPredC:=iC;
end;
end;
{ вычисление метрики и определение наиболее правдоподобной кодировки }
MinD:=0;
for CP:=Low(Vars) to High(Vars) do
begin
PV:=@Vars[CP];
PV^[32,32]:=0;
Max:=1;
for I:=0 to 33 do
for J:=0 to 33 do if PV^[I,J]>Max then Max:=PV^[I,J];
Factor:=255/Max; { ноpмализация }
D:=0;
for I:=0 to 33 do
for J:=0 to 33 do D:=D+Abs(PV^[I,J]*Factor-ModelBigrams[I,J]);
if (MinD=0) or (D<MinD) then
begin
MinD:=D;
DetermineRussian:=CP;
end;
end;
end;

begin
{ тест: слово "Пример" в разных кодировках (веpоятность ошибок на таких коpотких текстах высока - в данном слyчае пpосто повезло!) }
writeln(DetermineRussian(#$CF#$F0#$E8#$EC#$E5#$F0, 6) = cpWin1251);
writeln(DetermineRussian(#$8F#$E0#$A8#$AC#$A5#$E0, 6) = cp866);
writeln(DetermineRussian(#$F0#$D2#$C9#$CD#$C5#$D2, 6) = cpKOI8R);
readln;
end.



Страницы: 1 вся ветка

Форум: "Основная";
Текущий архив: 2003.11.13;
Скачать: [xml.tar.bz2];

Наверх




Память: 0.47 MB
Время: 0.05 c
14-41898
Dima_dvp
2003-10-23 21:54
2003.11.13
К вопросу об оптимизации при работе с WinAPI


6-41871
Кирил
2003-09-09 20:57
2003.11.13
Помогите, найти ошибку


3-41108
alexay
2003-10-23 09:20
2003.11.13
SQL monitor


3-40847
stud
2003-10-20 16:59
2003.11.13
и опять эти события


3-40799
denmin
2003-10-22 13:56
2003.11.13
Помогите!!! Проблема с передачей параметров!





Afrikaans Albanian Arabic Armenian Azerbaijani Basque Belarusian Bulgarian Catalan Chinese (Simplified) Chinese (Traditional) Croatian Czech Danish Dutch English Estonian Filipino Finnish French
Galician Georgian German Greek Haitian Creole Hebrew Hindi Hungarian Icelandic Indonesian Irish Italian Japanese Korean Latvian Lithuanian Macedonian Malay Maltese Norwegian
Persian Polish Portuguese Romanian Russian Serbian Slovak Slovenian Spanish Swahili Swedish Thai Turkish Ukrainian Urdu Vietnamese Welsh Yiddish Bengali Bosnian
Cebuano Esperanto Gujarati Hausa Hmong Igbo Javanese Kannada Khmer Lao Latin Maori Marathi Mongolian Nepali Punjabi Somali Tamil Telugu Yoruba
Zulu
Английский Французский Немецкий Итальянский Португальский Русский Испанский