Context Navigation

Back to Ticket #1123

close Warning: Can't synchronize with repository "(default)" (/var/svn/tolp does not appear to be a Subversion repository.). Look in the Trac log for more information.

Ticket #1123: fun_utf8.tol

File fun_utf8.tol, 7.9 KB (added by Pedro Gea, 14 years ago)

Line
1	//////////////////////////////////////////////////////////////////////////////
2	// Unicode y codificación UTF8
3	//
4	// Unicode es un estándar de codificación de caracteres.
5	// http://es.wikipedia.org/wiki/Unicode
6	//
7	// El bloque básico multilingüístico utiliza el intervalo hexadecimal:
8	// 0x0000-0xFFFF correspondiente al decimal (0-65535).
9	// Estos caracteres necesitan de 16 bits (2 bytes) para expresarse.
10	//
11	// UTF8 es un sistema para codificar los caracteres Unicode expresándolos
12	// como una combinación de caracteres de un byte.
13	// http://es.wikipedia.org/wiki/UTF-8
14	//
15	// De acuerdo a la codificación UTF-8:
16	// * El intervalo 0-127 se codifica con un carácter.
17	// * El intervalo 128-2047 con dos caracteres.
18	// * El intervalo 2048-65535 con tres caracteres.
19	// * El resto con cuatro caracteres.
20	//
21	// La conversión de caracteres se hace de acuerdo a los siguientes criterios:
22	// (1) Intervalo 0-127 ['00000000'-'01111111']
23	// unicode: '0xxxxxxx'
24	// utf8: '0xxxxxxx'
25	// (2) Intervalo 128-2047 ['10000000'-'00000111 11111111']
26	// unicode: '00000xxx yyzzzzzz'
27	// utf8: '110xxxyy' '10zzzzzz'
28	// (3) Intervalo 2048-65535 ['00001000 00000000'-'11111111 11111111']
29	// unicode: 'xxxxyyyy wwzzzzzz'
30	// utf8: '1110xxxx' '10yyyyww' '10zzzzzz'
31	//
32	// ASCII y Latin1
33	//
34	// La correspondencia de los 128 primeros caracteres sigue el estándar ASCII.
35	// http://es.wikipedia.org/wiki/Ascii
36	// Los 128 caracteres siguientes pueden ser codificados de distintos modos
37	// de acuerdo al estándar ISO/IEC 8859.
38	// Concretamente gran parte de las lenguas europeas (entre ellas el inglés,
39	// el español y el portugués) se codifican con la ISO 8859-1 (ISO Latin 1).
40	// http://es.wikipedia.org/wiki/ISO_8859-1
41	//
42	//////////////////////////////////////////////////////////////////////////////
43
44	//////////////////////////////////////////////////////////////////////////////
Text CharacterFromUTF8(Text utf8)
//////////////////////////////////////////////////////////////////////////////
// Decodes ONE character given as its UTF-8 byte sequence.
//
//   utf8: Text holding the 1, 2 or 3 bytes of the encoded character.
//
// Returns:
//   * length 0 or 1 : the argument unchanged;
//   * length 2 or 3 : Char(code), where code is rebuilt from the payload bits;
//   * any other     : "" after writing an error message with WriteLn.
//
// The marker bits ('110', '1110', '10') are removed by plain subtraction, so
// the bytes are NOT validated here; callers must pass a well-formed sequence.
//////////////////////////////////////////////////////////////////////////////
{
  Real len = TextLength(utf8);
  Case(len<=1, {
    utf8
  }, len==2, {
    Real u1 = ASCII(Sub(utf8,1,1));         // '110xxxyy'
    Real u2 = ASCII(Sub(utf8,2,2));         // '10zzzzzz'
    Real xxxyy  = u1 - 192;                 // strip the '110' lead marker
    Real zzzzzz = u2 - 128;                 // strip the '10' continuation marker
    Real yy  = xxxyy % 4;                   // two low bits of the lead payload
    Real xxx = (xxxyy - yy)/4;              // three high bits of the lead payload
    // Fix: both multiplications were missing here ("xxx256 + yy64"), which
    // referenced undefined names instead of shifting the bit groups.
    Real unicode = xxx*256 + yy*64 + zzzzzz; // 'xxx yyzzzzzz'
    Char(unicode)
  }, len==3, {
    Real u1 = ASCII(Sub(utf8,1,1));         // '1110xxxx'
    Real u2 = ASCII(Sub(utf8,2,2));         // '10yyyyww'
    Real u3 = ASCII(Sub(utf8,3,3));         // '10zzzzzz'
    Real xxxx   = u1 - 224;                 // strip the '1110' lead marker
    Real yyyyww = u2 - 128;                 // strip the '10' continuation marker
    Real ww   = yyyyww % 4;                 // two low bits of the middle payload
    Real yyyy = (yyyyww - ww)/4;            // four high bits of the middle payload
    Real zzzzzz = u3 - 128;                 // strip the '10' continuation marker
    Real v1 = xxxx*16 + yyyy;               // high byte 'xxxxyyyy'
    Real v2 = ww*64 + zzzzzz;               // low byte  'wwzzzzzz'
    Real unicode = v1*256 + v2;             // 'xxxxyyyy wwzzzzzz'
    Char(unicode)
  }, True, {
    // Sequences of four or more bytes are not supported.
    WriteLn("[CharacterFromUTF8] No está implementada la conversión de UTF-8 "
      <<"codificado con 4 caracteres.", "E");
    ""
  })
};
//////////////////////////////////////////////////////////////////////////////
PutDescription(
"Devuelve el caracter Unicode codificado con el estándar UTF-8 indicado.\n"
"Por ejemplo: CharacterFromUTF8(\"Ã³\") -> \"ó\"", CharacterFromUTF8);
83	//////////////////////////////////////////////////////////////////////////////
84
85	//////////////////////////////////////////////////////////////////////////////
Text CharacterToUTF8(Text char)
//////////////////////////////////////////////////////////////////////////////
// Encodes ONE character as its UTF-8 byte sequence.
//
//   char: Text of length 0 or 1.
//
// Returns:
//   * ""              when char is empty;
//   * char itself     when its code (as given by ASCII) is <= 127;
//   * two bytes       when the code is <= 2047;
//   * three bytes     when the code is <= 65535;
//   * ""              (after an error message) for larger codes or when the
//                     argument holds more than one character.
//
// NOTE(review): if ASCII() only yields one-byte values (0-255) the three-byte
// branch is never reached in practice - confirm against the TOL reference.
//////////////////////////////////////////////////////////////////////////////
{
  Real nChars = TextLength(char);
  Case(nChars==0, {
    ""
  }, nChars==1, {
    Real codepoint = ASCII(char);
    Case(codepoint <= 127, {
      char
    }, codepoint <= 2047, {
      // Source bits 'xxx yyzzzzzz' -> '110xxxyy' '10zzzzzz'
      Real lowByte = codepoint % 256;            // 'yyzzzzzz'
      Real top3    = (codepoint - lowByte)/256;  // 'xxx'
      Real tail6   = lowByte % 64;               // 'zzzzzz'
      Real mid2    = (lowByte - tail6)/64;       // 'yy'
      Real lead = 192 + top3*4 + mid2;           // '110xxxyy'
      Real cont = 128 + tail6;                   // '10zzzzzz'
      Char(lead)<<Char(cont)
    }, codepoint <= 65535, {
      // Source bits 'xxxxyyyy wwzzzzzz' -> '1110xxxx' '10yyyyww' '10zzzzzz'
      Real lowByte  = codepoint % 256;             // 'wwzzzzzz'
      Real highByte = (codepoint - lowByte)/256;   // 'xxxxyyyy'
      Real tail6    = lowByte % 64;                // 'zzzzzz'
      Real mid2     = (lowByte - tail6)/64;        // 'ww'
      Real lowNibble  = highByte % 16;             // 'yyyy'
      Real highNibble = (highByte - lowNibble)/16; // 'xxxx'
      Real lead  = 224 + highNibble;               // '1110xxxx'
      Real cont1 = 128 + lowNibble*4 + mid2;       // '10yyyyww'
      Real cont2 = 128 + tail6;                    // '10zzzzzz'
      Char(lead)<<Char(cont1)<<Char(cont2)
    }, True, {
      // Codes needing a four-byte sequence are not supported.
      WriteLn("[CharacterToUTF8] No está implementada la conversión a UTF-8 "
        <<"codificado con 4 caracteres.", "E");
      ""
    })
  }, True, {
    // More than one character was passed.
    WriteLn("[CharacterToUTF8] El argumento de la función ha de ser un único "
      <<" carácter.", "E");
    ""
  })
};
//////////////////////////////////////////////////////////////////////////////
PutDescription(
"Devuelve un caracter Unicode usando sólo caracteres ASCII de acuerdo con el "
"estándar UTF-8.\n"
"Por ejemplo: CharacterToUTF8(\"ó\") -> \"Ã³\"", CharacterToUTF8);
131	//////////////////////////////////////////////////////////////////////////////
132
133	//////////////////////////////////////////////////////////////////////////////
Text TextFromUTF8(Text txt)
//////////////////////////////////////////////////////////////////////////////
// Decodes a UTF-8 encoded text, one sequence at a time.
//
//   txt: Text to decode.
//
// Walks txt from left to right. The code of the current byte decides how
// many bytes are tried as one sequence:
//   * code < 192        : copied as is;
//   * 192 <= code < 224 : two-byte sequence, if the next byte is in 128..191;
//   * 224 <= code < 240 : three-byte sequence, if the next two bytes are both
//                         in 128..191;
//   * code >= 240       : copied as is.
// When the following bytes are not in 128..191, the lead byte is copied as
// is and scanning resumes at the very next byte.
// Well-formed sequences are converted with CharacterFromUTF8.
//////////////////////////////////////////////////////////////////////////////
{
  Text decoded = "";
  Real total = TextLength(txt);
  Real pos = 1;
  Real While(pos <= total, {
    Text lead = Sub(txt,pos,pos);
    Real leadCode = ASCII(lead);
    // Case takes the first true condition, so each branch only needs its
    // upper bound.
    Text piece = Case(leadCode<192, {
      Text lead
    }, leadCode<224, {
      Text next1 = Sub(txt,pos+1,pos+1);
      Text If(And(ASCII(next1)>=128, ASCII(next1)<192), {
        Real pos := pos + 1;   // consume the continuation byte
        Text CharacterFromUTF8(lead<<next1)
      }, {
        Text lead
      })
    }, leadCode<240, {
      Text next1 = Sub(txt,pos+1,pos+1);
      Text next2 = Sub(txt,pos+2,pos+2);
      Text If(And(ASCII(next1)>=128, ASCII(next1)<192,
                  ASCII(next2)>=128, ASCII(next2)<192), {
        Real pos := pos + 2;   // consume both continuation bytes
        Text CharacterFromUTF8(lead<<next1<<next2)
      }, {
        Text lead
      })
    }, True, {
      Text lead
    });
    Text decoded := decoded << piece;
    Real pos := pos + 1
  });
  Text ""<<decoded
};
//////////////////////////////////////////////////////////////////////////////
PutDescription(
"Devuelve con caracteres Unicode un texto codificado con el estándar UTF-8.\n"
"Por ejemplo: TextFromUTF8(\"CigÃŒeÃ±a\") -> \"Cigüeña\"", TextFromUTF8);
173	//////////////////////////////////////////////////////////////////////////////
174
175	//////////////////////////////////////////////////////////////////////////////
Text TextToUTF8(Text txt)
//////////////////////////////////////////////////////////////////////////////
// Encodes a text as UTF-8.
//
//   txt: Text to encode.
//
// Each one-character slice Sub(txt,i,i) is passed to CharacterToUTF8 and the
// results are concatenated in order. An empty txt yields "".
//////////////////////////////////////////////////////////////////////////////
{
  Text encoded = "";
  Real total = TextLength(txt);
  Real pos = 1;
  Real While(pos <= total, {
    Text encoded := encoded << CharacterToUTF8(Sub(txt,pos,pos));
    Real pos := pos + 1
  });
  Text ""<<encoded
};
//////////////////////////////////////////////////////////////////////////////
PutDescription(
"Devuelve el texto codificando los caracteres Unicode usando sólo caracteres "
"ASCII de acuerdo con el estándar UTF-8.\n"
"Por ejemplo: TextToUTF8(\"Cigüeña\") -> \"CigÃŒeÃ±a\"", TextToUTF8);
193	//////////////////////////////////////////////////////////////////////////////

Download in other formats:

Original Format