OpenVMS Source Code Demos
BASIC_ISO_8859_TO_ASCII
1000 %title "iso-8859-to-ascii-xxx.bas" !
%ident "Version_104.4" ! <<<---+
declare string constant k_version = "104.4" , ! <<<---+ &
k_program = "iso-8859-to-ascii" !
!=========================================================================================================================
! title : iso-8859-to-ascii-xxx.BAS
! author : Neil Rieck (https://neilrieck.net/) ([email protected])
! created: 2001-08-22
! caveats: Terminal Emulations (2013):
! 1) this program appears to have some visual glitches when used with Attachmate Reflection 14.1 using default
! settings for VT-320 (Host Character Set: "DEC Suplemental", PC Character Set: "Windows"). Changing the
! host character set to "ISO-Latin-1 (8859-1)" does not improve things while "Windows Latin (1252)" does.
! As expected, changing host character set to "UTF-8" displays UTF pairs as one character.
! 2) this program did not appear to have any visual glitches when used with Tera Term 4.78 where coding defaults
! to "UTF-8" with codepage 65001 (a.k.a. Microsoft). However, all the 2-character UTF data is displayed as
! one character.
!
! History:
! ver who when what
! --- --- ------ ---------------------------------------------------------------------------------------------------------
! 100 NSR 020822 1. original stub (for use else where)
! 101 NSR 081211 1. started work on integrating a unicode translator
! 102 NSR 121009 1. bug fix in unicode decoder
! 103 NSR 121009 1. more work on the test cases
! 104 NSR 130829 1. improved documentation
! 2. changes before copying function unicode_to_iso to our external function library
! 3. created an iso_to_unicode function for better reverse testing
! NSR 130830 4. a few bug fixes in the test case 256
!========================================================================================================================
option type=explicit ! no kid stuff
set no prompt ! no ? on input
!
external string function unicode_to_iso( string, long ) !
external string function iso_to_unicode( string ) !
external string function long_to_hex( long, long ) !
!
declare string constant htab = '9'C , ! horizontal tab &
null = '0'C , ! null &
alpha = "0123456789ABCDEF" !
!
declare string iso_translate$ ,&
input_data$ ,&
temp$ ,&
output_data$ ,&
choice$ ,&
junk$ ,&
junk1$ ,&
junk2$ ,&
junk3$ ,&
long i% ,&
debug% ,&
junk% ,&
column
!
!=======================================================================
! <<< init >>>
!=======================================================================
2000 init:
debug% = 0 !
gosub init_iso_translator !
!
! sample data arrays
!
dim string u$(160 to 255) ! unicode data
dim string h$(160 to 255) ! hex data
declare string r256$ ! raw data
declare string u256$ ! unicode data
dim string g$(10) ! german phrase
!-----------------------------------------------------------------------
! source: http://de.wikipedia.org/wiki/Reinheitsgebot
!
g$(1) = 'Die erste Erw�hnung der Bezeichnung ,Reinheitsgebot" ist in einem'
g$(2) = '!Sitzungsprotokoll des bayrischen Landtags vom 4. M�rz 1918 belegt.'
g$(3) = '!Die Bezeichnung setzte sich jedoch erst allm�hlich durch, au�erhalb'
g$(4) = '!Bayerns erst w�hrend des Streits um das sogenannte ,S��bier" in den'
g$(5) = '!1950er-Jahren. Sowohl bayrische als auch au�erbayrische Zeitungen'
g$(6) = '!berichteten h�ufig'
!-----------------------------------------------------------------------
!
! load sample data arrays
!
for i% = 160 to 255 ! to ÿ
u$(i%) = iso_to_unicode(chr$(i%)) ! create unicode data from chr$(i%)
h$(i%) = long_to_hex(i%,1) ! create hex data from i%
next i% !
!
junk$ = "" ! init
for i% = 160 to 255 ! to ÿ
junk$ = junk$ + chr$(i%) ! build one big string
next i% !
r256$= junk$ ! save a copy for later
u256$= iso_to_unicode (junk$) ! convert to unicode
!
!=======================================================================
! <<< main >>>
!=======================================================================
3000 main:
print k_program +"_"+ k_version !
print string$(len(k_program +"_"+ k_version), asc("=")) ! underline previous line (how will this optimize?)
!
print "test-data menu: "
column = 0 ! init
for i% = 160 to 255 ! 160-255 test cases
print "("; !
print using "<0>##"; i%; !
print ") hex="; !
print h$(i%); !
print " utf=";u$(i%);" "; !
column = column + 1 !
if column = 5 then !
column = 0 !
print !
end if !
next i% !
print ! EOL
print "(256) raw="; r256$ !
print " utf="; u256$ !
print "(257) decode German phrases" !
print "enter? (160-257, or raw text) "; !
when error in !
linput choice$ !
use !
print "-e-error: "+str$(err)+" during input" !
end when !
when error in !
junk% = integer(choice$) !
use !
junk% = 999 !
end when !
select junk% !
case < 160 !
goto exit_now !
case 160 to 255 !
temp$ = u$(junk%) !
case 256 !
temp$ = u256$ !
case 257 !
i% = 1 ! init
while g$(i%) <> "" ! scan the array
junk1$ = g$(i%) ! copy and remember phrase
print str$(i%)+" german : "+ junk1$ ! show the original phrase
junk2$ = iso_to_unicode( junk1$ ) ! convert to unicode
print str$(i%)+" unicode: ";junk2$ ! show it
junk3$ = unicode_to_iso(junk2$,0) ! convert to iso
print str$(i%)+" iso : ";junk3$; ! show it (should be the same)
if junk1$ = junk3$ then !
print " (same)" !
else !
print " (different)" !
end if !
i% = i% + 1 !
next !
goto exit_now !
case 999 !
temp$ = edit$( choice$, 128+16+8) ! no trailing, compress, no leading
case else !
goto exit_now !
end select !
print "-i- utf data : "+ temp$ !
temp$ = unicode_to_iso(temp$,debug%) !
print "-i- 8-bit ASCII: "+ temp$ !
output_data$ = xlate$( temp$, iso_translate$) ! translate ISO -> ASCII
print "-i- 7-bit ASCII: "+ output_data$ !
exit_now:
print "============================================================"
goto fini !
!
!========================================================================================
!
! <<< init ISO-8859-1 character translator >>>
!
! build an ASCII translation table
! notes: 1. remember that <NUL> is in position #1 of iso_translate$
! 2. this routine converts some 8-bit characters into 7-bit via translation
!========================================================================================
init_iso_translator:
!
iso_translate$ = "" ! init
!
%let %paranoid = 1% ! paranoid filtering (lowest 14 become <null>)
%if %paranoid = 0% !
%then ! normal filtering (lowest 14 are as-is)
for i% = 0 to 13 ! build 7-bit table ( from <NUL> to <CR> )
iso_translate$ = iso_translate$ + chr$(i%) ! use these as-is
next i% !
%else ! paranoid filtering (lowest 14 become <null>)
for i% = 0 to 13 ! build 7-bit table ( from <NUL> to <CR> )
iso_translate$ = iso_translate$ + null ! change these to <null>
next i% !
mid$( iso_translate$, 9 +1, 1) = htab ! restore our delimiter
mid$( iso_translate$, 13 +1, 1) = " " ! convert <cr> to <sp>
%end %if !
for i% = 14 to 31 ! build 7-bit table ( from <SO> to <US> )
iso_translate$ = iso_translate$ + null ! change these to <null>
next i% !
for i% = 32 to 127 ! build 7-bit table
iso_translate$ = iso_translate$ + chr$(i%) ! use these as-is
next i% !
!
! now patch the lower ascii translation table as required by your application
!
mid$( iso_translate$, 96% +1, 1) = "'" ! convert "`" to "'"
!~~~ mid$( iso_translate$, 123% +1, 1) = " " x map "{" to <sp>
!~~~ mid$( iso_translate$, 124% +1, 1) = " " x map "}" to <sp>
!~~~ mid$( iso_translate$, 125% +1, 1) = " " x map "|" to <sp>
!~~~ mid$( iso_translate$, 126% +1, 1) = null x map "~" to <null>
mid$( iso_translate$, 127% +1, 1) = null ! map <DEL> to <null>
!
! this area contains the second half of the 8 bit character set
!
for i% = 128 to 159 !
iso_translate$ = iso_translate$ + null ! change these to <null>
next i% !
for i% = 160 to 255 !
iso_translate$ = iso_translate$ + " " ! change these to <sp>
next i% !
!
! now patch the upper ascii translation table (a subjective translation of ISO-8859-1)
!
mid$( iso_translate$, 160 +1, 1) = " " ! non-break space
mid$( iso_translate$, 161 +1, 1) = "!" ! inverted exclamation
mid$( iso_translate$, 162 +1, 1) = "c" ! cents sign
mid$( iso_translate$, 163 +1, 1) = "$" ! pound sign
mid$( iso_translate$, 164 +1, 1) = "$" ! currency sign
mid$( iso_translate$, 165 +1, 1) = "$" ! yen sign
!
mid$( iso_translate$, 169 +1, 1) = "c" ! copyright sign
!
mid$( iso_translate$, 171 +1, 1) = "<" ! left angle quotes
!
mid$( iso_translate$, 174 +1, 1) = "r" ! registered trademark
!
mid$( iso_translate$, 177 +1, 1) = "+" ! plus-minus sign
!
mid$( iso_translate$, 180 +1, 1) = "'" ! spacing acute
mid$( iso_translate$, 181 +1, 1) = "u" ! mirco sign
!
mid$( iso_translate$, 183 +1, 1) = "." ! middle dot
mid$( iso_translate$, 184 +1, 1) = "," ! spacing cedilla
!
mid$( iso_translate$, 187 +1, 1) = ">" ! right angle quotes
!
mid$( iso_translate$, 191 +1, 1) = "?" ! inverted question mark
mid$( iso_translate$, 192 +1, 1) = "A" ! A grave
mid$( iso_translate$, 193 +1, 1) = "A" ! acute
mid$( iso_translate$, 194 +1, 1) = "A" ! circumflex
mid$( iso_translate$, 195 +1, 1) = "A" ! tilde
mid$( iso_translate$, 196 +1, 1) = "A" ! diaresis
mid$( iso_translate$, 197 +1, 1) = "A" ! ring
mid$( iso_translate$, 198 +1, 1) = "A" ! ligature
mid$( iso_translate$, 199 +1, 1) = "C" ! C cedilla
mid$( iso_translate$, 200 +1, 1) = "E" ! E grave
mid$( iso_translate$, 201 +1, 1) = "E" ! acute
mid$( iso_translate$, 202 +1, 1) = "E" ! circumflex
mid$( iso_translate$, 203 +1, 1) = "E" ! diaresis
mid$( iso_translate$, 204 +1, 1) = "I" ! I grave
mid$( iso_translate$, 205 +1, 1) = "I" ! acute
mid$( iso_translate$, 206 +1, 1) = "I" ! circumflex
mid$( iso_translate$, 207 +1, 1) = "I" ! diaresis
mid$( iso_translate$, 208 +1, 1) = "D" ! ETH
mid$( iso_translate$, 209 +1, 1) = "N" ! N tilde
mid$( iso_translate$, 210 +1, 1) = "O" ! O grave
mid$( iso_translate$, 211 +1, 1) = "O" ! acute
mid$( iso_translate$, 212 +1, 1) = "O" ! circumflex
mid$( iso_translate$, 213 +1, 1) = "O" ! tilde
mid$( iso_translate$, 214 +1, 1) = "O" ! diaresis
mid$( iso_translate$, 215 +1, 1) = "x" ! Multiply Sign
mid$( iso_translate$, 216 +1, 1) = "O" ! O slash
mid$( iso_translate$, 217 +1, 1) = "U" ! U grave
mid$( iso_translate$, 218 +1, 1) = "U" ! acute
mid$( iso_translate$, 219 +1, 1) = "U" ! circumflex
mid$( iso_translate$, 220 +1, 1) = "U" ! diaresis
mid$( iso_translate$, 221 +1, 1) = "Y" ! Y acute
mid$( iso_translate$, 222 +1, 1) = "p" ! thorn
mid$( iso_translate$, 223 +1, 1) = "B" ! sharp s
mid$( iso_translate$, 224 +1, 1) = "a" ! a grave
mid$( iso_translate$, 225 +1, 1) = "a" ! acute
mid$( iso_translate$, 226 +1, 1) = "a" ! circumflex
mid$( iso_translate$, 227 +1, 1) = "a" ! tilde
mid$( iso_translate$, 228 +1, 1) = "a" ! diaeresis
mid$( iso_translate$, 229 +1, 1) = "a" ! ring
mid$( iso_translate$, 230 +1, 1) = "a" ! ligature
mid$( iso_translate$, 231 +1, 1) = "c" ! c cedilla
mid$( iso_translate$, 232 +1, 1) = "e" ! e grave
mid$( iso_translate$, 233 +1, 1) = "e" ! acute
mid$( iso_translate$, 234 +1, 1) = "e" ! circumflex
mid$( iso_translate$, 235 +1, 1) = "e" ! diaeresis
mid$( iso_translate$, 236 +1, 1) = "i" ! i grave
mid$( iso_translate$, 237 +1, 1) = "i" ! acute
mid$( iso_translate$, 238 +1, 1) = "i" ! circumflex
mid$( iso_translate$, 239 +1, 1) = "i" ! diaeresis
mid$( iso_translate$, 240 +1, 1) = "o" ! eth
mid$( iso_translate$, 241 +1, 1) = "n" ! n tilde
mid$( iso_translate$, 242 +1, 1) = "o" ! o grave
mid$( iso_translate$, 243 +1, 1) = "o" ! acute
mid$( iso_translate$, 244 +1, 1) = "o" ! circumflex
mid$( iso_translate$, 245 +1, 1) = "o" ! tilde
mid$( iso_translate$, 246 +1, 1) = "o" ! diaeresis
mid$( iso_translate$, 247 +1, 1) = "/" ! division sign
mid$( iso_translate$, 248 +1, 1) = "o" ! o slash
mid$( iso_translate$, 249 +1, 1) = "u" ! u grave
mid$( iso_translate$, 250 +1, 1) = "u" ! acute
mid$( iso_translate$, 251 +1, 1) = "u" ! circumflex
mid$( iso_translate$, 252 +1, 1) = "u" ! diaeresis
mid$( iso_translate$, 253 +1, 1) = "y" ! y acute
mid$( iso_translate$, 254 +1, 1) = "b" ! thorn
mid$( iso_translate$, 255 +1, 1) = "y" ! y diaeresis
return !
!=======================================================================
! <<< adios... >>>
!=======================================================================
31000 fini: !
end !
!
!########################################################################################################################
!
!======================================================================================
! Title : unicode to iso
! note : UTF-8 encoding (see RFC 2279) http://www.faqs.org/rfcs/rfc2279.html
! entry : string data to scan
! return: resultant string
!
! UCS-4 range UTF-8 octet sequence (binary) payload data
! ------------------- ----------------------------- ------------
! 0000,0000-0000,007F 0xxxxxxx 7-data bits
! 0000,0080-0000,07FF 110xxxxx 10xxxxxx 11-data bits
! 0000,0800-0000,FFFF 1110xxxx 10xxxxxx 10xxxxxx 16-data bits
! 0001,0000-001F,FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 21-data bits
! 0020,0000-03FF,FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 26-data bits
! 0400,0000-7FFF,FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 31-data bits
!=======================================================================================
32000 function string unicode_to_iso( string inbound$, long debug% ) !
option type=explicit !
declare string cpy$, &
temp$, &
long i%, j%, k%, z%
!
cpy$ = inbound$ ! copy original data
for i% = 1 to len(cpy$) ! scan the string
j% = asc(mid$(cpy$,i%,1)) ! test current character
!
! 7-bit test
!
if (j% and B"10000000"L) = 0 then !
goto next_character ! leave "as-is"
end if !
!
! 2-character test (first 3-bits of byte-1 must be "110")
! (first 2-bits of byte-2 must be "10" )
!
if (j% and B"11100000"L) = 192 then ! must only be 110xxxxx
!
! 2-character transformations
! byte-1 byte-2
! 110xxxxx 10xxxxxx
!
j% = j% and B"11111"L ! keep lower 5 bits for processing
j% = j% * 64% ! shift left by 6 places (prep for merge)
k% = asc(mid$(cpy$,i%+1,1)) ! grab next character
if (k% and B"11000000"L) <> 128 then ! must only be 10xxxxxx
select debug% !
case 0 !
case 1 to 90 !
print "-e-unicode sanity error" !
print "-i-character: "+str$(i%+1) !
case 91 to 109 !
when error in !
print #debug%,"-e-unicode sanity error" !
print #debug%,"-i-character: "+str$(i%+1) !
use !
end when !
end select !
goto next_character !
end if !
k% = k% and B"111111"L ! keep lower 6 bits for processing
z% = j% or k% ! merge the bits
mid$(cpy$,i% ,1) = chr$((z% and B"111100000000"L)/256%) ! process and write back to char+0
mid$(cpy$,i%+1,1) = chr$( z% and B"11111111"L) ! process and write back to char+1
i% = i% + 1 ! fix pointer
end if !
!
! 3-character test (not yet required on our system)
!
if (j% and B"11110000"L) = 224 then ! keep 4 highest bits for group test
end if !
!
! 4-character test (not yet required on our system)
!
if (j% and B"11111000"L) = 240 then ! keep 5 highest bits for group test
end if !
!
! 5-character test (not yet required on our system)
!
if (j% and B"11111100"L) = 248 then ! keep 6 highest bits for group test
end if !
!
! 7-character test (not yet required on our system)
!
if (j% and B"11111110"L) = 252 then ! keep 6 highest bits for group test
end if !
!
! if we get here then the character was a natural 8-bit byte
!
next_character: !
next i% !
!
cpy$ = edit$(cpy$, 4) ! remove control characters
if cpy$ <> inbound$ then !
select debug% !
case 0 !
case 1 to 90 !
print "===============================================" !
print "-i- UTF-8 : "+ inbound$ !
print "-i- ISO-8859-1 : "+ cpy$ !
print "===============================================" !
case 91 to 109 !
when error in
print #debug%,"-i- UTF-8 : "+ inbound$ !
print #debug%,"-i- ISO-8859-1 : "+ cpy$ !
use !
end when !
end select !
end if !
!
function_exit: !
unicode_to_iso = cpy$ !
end function !
!
!=======================================================================================
! Title : iso to unicode
! note : UTF-8 encoding (see RFC 2279) http://www.faqs.org/rfcs/rfc2279.html
! entry : string data to scan
! return: resultant string
!
! UCS-4 range (hex.) UTF-8 octet sequence (binary) 4-bit nibble payload data
! ------------------- ----------------------------- ------------ ------------
! 0000 0000-0000 007F 0xxxxxxx 7-data bits
! 0000 0080-0000 07FF 110xxxxx 10xxxxxx 0xCx-0xDx 11-data bits
! 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 0xEx 16-data bits
! 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 0xFx 21-data bits
! 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 0xFx 26-data bits
! 0400 0000-7FFF FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 0xFx 31-data bits
!=======================================================================================
32010 function string iso_to_unicode( string inbound$ ) !
option type=explicit !
declare string out$, &
temp$, &
long i%, j%, k%, l%
!
out$ = "" ! init
for i% = 1 to len(inbound$) ! scan the string
j% = asc(mid$(inbound$,i%,1)) ! test current character
select j% !
case 0 to 127 ! must be 7-bit data
out$ = out$ + chr$(j%) ! use "as-is"
case 128 to 255 !
l% = j% and B"00111111"L ! isolate lower 6-bits for processing
k% = j% and B"11000000"L ! isolate upper 2-bits for processing
k% = k% / 64 ! shift right by 6 bits
out$ = out$ + chr$(X"c0"L or k%) + chr$(128% or l%) ! create unicode data from c2
end select !
next i% !
!
function_exit: !
iso_to_unicode = out$ !
end function !
!========================================================================
! long_to_hex()
! entry: inbound% (data to convert)
! size% (size in bytes)
!========================================================================
32020 function string long_to_hex( long inbound%, long size% )
option type=explicit
declare string constant alpha = "0123456789ABCDEF" !
declare long i%, z%, temp% !
declare string junk$
!
temp% = inbound% !
for i%= 1 to (size% * 2) !
z% = (temp% and 15%) ! isolate last nibble
junk$ = mid$(alpha,z%+1,1) + junk$ !
temp% = temp% / 16% ! shift by four bits
next i%
!
function_exit: !
long_to_hex = junk$ !
end function !
!========================================================================