gb2312转utf-8程序

时间:2006-12-21 15:07:02   来源:  作者:whsong  点击:次  出处:技术无忧
关键字:gb2312

<?php
// Program by Donald,Milddragon Studio.
// Email: wilddragon#sina.com
// The gb2312.txt mapping table is not bundled with this script: search for it
// online and put it in the directory of the executing script.

// Build the GB2312 <-> Unicode lookup tables once, as global variables, so the
// conversion functions do not have to re-read the mapping file on every call.
//   $____global_codetable : GB2312 code (int, taken from the 4 hex digits at
//                           offset 2 of a line) => Unicode code point (the
//                           4-character hex string at offset 9 of the line)
//   $____global_codetable2: Unicode code point (int) => GB2312 code (int)
// Lines starting with '#' are comments. Each mapping line is expected to look
// like "0xGGGG<separator>0xUUUU ..."; anything else is ignored.
$____global_codetable=array();
$____global_codetable2=array();
$____global_filename=pathinfo($_SERVER["SCRIPT_FILENAME"]);
$____global_filename=$____global_filename["dirname"]."/gb2312.txt";
$____global_tmp=file($____global_filename);
if (!is_array($____global_tmp))
{
    // file() has already raised a warning; carry on with empty tables so the
    // conversion functions degrade gracefully instead of crashing.
    $____global_tmp=array();
}
foreach ($____global_tmp as $____global_line)
{
    // Skip comment lines and lines too short to hold both hex fields.
    if (strlen($____global_line)<13 || $____global_line[0]==='#')
    {
        continue;
    }
    $____global_gbhex=substr($____global_line,2,4);
    $____global_unihex=substr($____global_line,9,4);
    // Both fields must be exactly four hex digits, otherwise the line is not a mapping.
    if (strspn($____global_gbhex,"0123456789abcdefABCDEF")!==4
        || strspn($____global_unihex,"0123456789abcdefABCDEF")!==4)
    {
        continue;
    }
    // Fill both directions in a single pass over the file.
    $____global_codetable[hexdec($____global_gbhex)]=$____global_unihex;
    $____global_codetable2[hexdec($____global_unihex)]=hexdec($____global_gbhex);
}
// Do not leave the temporaries behind in the global scope.
unset($____global_line);
unset($____global_gbhex);
unset($____global_unihex);
unset($____global_filename);
unset($____global_tmp);


/*
Convert text containing numeric character references ("&#NNNN;" decimal or
"&#xHHHH;" hexadecimal), possibly mixed with other ASCII text, into GB2312 text.
Can be used when converting the encoding of an XML document; note that the
function does not touch the encoding declaration inside the XML.

$un      input text
returns  the text with every reference whose code point is present in
         $____global_codetable2 replaced by the two GB2312 bytes. References
         that are malformed or have no GB2312 mapping (for example "&#60;")
         are copied through unchanged rather than turned into garbage bytes.
         Input that is empty after trim() is returned as is.
*/
function unicode2gb($un)
{
    if(!trim($un))
        return $un;
    global $____global_codetable2;
    $un=(string)$un;
    $gb="";
    $len=strlen($un);
    $pos=0;
    // Walk the string by offset instead of repeatedly re-copying the remainder.
    while($pos<$len)
    {
        $start=strpos($un,"&#",$pos);
        if ($start===FALSE)// no more references in the string
        {
            $gb.=substr($un,$pos);
            break;
        }
        // Copy the plain text that precedes the reference.
        $gb.=substr($un,$pos,$start-$pos);
        $end=strpos($un,";",$start+2);
        if ($end===FALSE)// unterminated prefix: the rest is plain text
        {
            $gb.=substr($un,$start);
            break;
        }
        $digits=substr($un,$start+2,$end-$start-2);
        $code=-1;// -1 marks "not a valid numeric reference"
        if ($digits!=="" && ($digits[0]==="x" || $digits[0]==="X"))
        {
            // hexadecimal form
            $hex=(string)substr($digits,1);
            if ($hex!=="" && strspn($hex,"0123456789abcdefABCDEF")===strlen($hex))
                $code=hexdec($hex);
        }
        else if ($digits!=="" && strspn($digits,"0123456789")===strlen($digits))
        {
            // decimal form
            $code=intval($digits);
        }
        // hexdec() returns a float on overflow, hence the is_int() check.
        if (is_int($code) && $code>=0 && isset($____global_codetable2[$code]))
        {
            $code=0x8080|$____global_codetable2[$code];
            $gb.=chr(($code>>8) & 0xFF);
            $gb.=chr($code & 0xFF);
            $pos=$end+1;
        }
        else
        {
            // Not convertible: emit the "&#" literally and keep scanning right
            // after it, so the remainder is copied as ordinary text.
            $gb.="&#";
            $pos=$start+2;
        }
    }
    return $gb;
}

/*
Convert GB2312 text (possibly mixed with ASCII) into ASCII text in which every
GB2312 character is written as a hexadecimal character reference "&#xHHHH;".
Can be used when converting the encoding of an XML document; note that the
function does not touch the encoding declaration inside the XML.

$gb      input text in GB2312
returns  the converted text. Bytes <= 127 are copied unchanged. A two-byte
         sequence that is not found in $____global_codetable, or a lead byte
         without a following high-bit byte, becomes "&#xFFFD;" (the Unicode
         replacement character); in the latter case only the lead byte is
         consumed so a following ASCII character is not lost.
         Input that is empty after trim() is returned as is.
*/
function gb2unicode($gb)
{
  if(!trim($gb))
    return $gb;
  global $____global_codetable;
  $gb=(string)$gb;
  $utf="";
  $len=strlen($gb);
  $i=0;
  while($i<$len)
  {
    $lead=ord($gb[$i]);
    if ($lead<=127)
    {
      // plain ASCII byte
      $utf.=$gb[$i];
      $i++;
      continue;
    }
    $trail=($i+1<$len) ? ord($gb[$i+1]) : 0;
    if ($trail>127)
    {
      // Strip the high bit of both bytes to get the table key.
      $key=(($lead<<8)|$trail)&0x7F7F;
      $i+=2;
      if (isset($____global_codetable[$key]))
      {
        $utf.="&#x".$____global_codetable[$key].";";
        continue;
      }
    }
    else
    {
      // Truncated or broken pair: drop only the lead byte.
      $i++;
    }
    $utf.="&#xFFFD;";
  }
  return $utf;
}

/*
Convert UTF-8 encoded text into GB2312 encoded text. Unlike unicode2gb this is
a conversion between the two binary encodings.

$utf8    input text in UTF-8
returns  the GB2312 text. ASCII bytes are copied unchanged. Two-, three- and
         four-byte sequences are decoded to a code point and looked up in
         $____global_codetable2; a code point without a mapping becomes "?".
         Stray continuation bytes and invalid lead bytes are skipped, and a
         sequence cut off by the end of the input is dropped.
         Input that is empty after trim() is returned as is.
*/
function utf82gb($utf8)
{
  if(!trim($utf8))
    return $utf8;
  global $____global_codetable2;
  $utf8=(string)$utf8;
  $gb="";
  $len=strlen($utf8);
  $i=0;
  while($i<$len)
  {
    $d=ord($utf8[$i]);
    if (($d&0x80)==0)// 1 byte (ASCII)
    {
      $gb.=$utf8[$i];
      $i++;
      continue;
    }
    if (($d&0xE0)==0xC0)// 2 bytes
    {
      $need=2;
      $cp=$d&0x1F;
    }
    else if (($d&0xF0)==0xE0)// 3 bytes
    {
      $need=3;
      $cp=$d&0x0F;
    }
    else if (($d&0xF8)==0xF0)// 4 bytes
    {
      $need=4;
      $cp=$d&0x07;
    }
    else// stray continuation byte or invalid lead byte
    {
      $i++;
      continue;
    }
    if ($i+$need>$len)// sequence is cut off by the end of the input
      break;
    // Fold the 6 payload bits of every continuation byte into the code point.
    for ($k=1;$k<$need;$k++)
      $cp=($cp<<6)|(ord($utf8[$i+$k])&0x3F);
    $i+=$need;
    if (isset($____global_codetable2[$cp]))
    {
      $code=0x8080|$____global_codetable2[$cp];
      $gb.=chr(($code>>8) & 0xFF);
      $gb.=chr($code & 0xFF);
    }
    else
    {
      // No GB2312 equivalent: substitute instead of emitting invalid bytes.
      $gb.="?";
    }
  }
  return $gb;
}

/*
Convert GB2312 encoded text into UTF-8 encoded text. Unlike gb2unicode this is
a conversion between the two binary encodings.

$gb      input text in GB2312
returns  the UTF-8 text. Bytes <= 127 are copied unchanged. Every two-byte
         sequence is looked up in $____global_codetable (whose values are hex
         strings) and the resulting code point is UTF-8 encoded. A sequence
         without a mapping, or a lead byte without a following high-bit byte,
         becomes U+FFFD (the replacement character); in the latter case only
         the lead byte is consumed so a following ASCII character is not lost.
         Input that is empty after trim() is returned as is.
*/
function gb2utf8($gb)
{
  if(!trim($gb))
    return $gb;
  global $____global_codetable;
  $gb=(string)$gb;
  $utf8="";
  $len=strlen($gb);
  $i=0;
  while($i<$len)
  {
    $lead=ord($gb[$i]);
    if ($lead<=127)
    {
      // plain ASCII byte
      $utf8.=$gb[$i];
      $i++;
      continue;
    }
    $trail=($i+1<$len) ? ord($gb[$i+1]) : 0;
    $code=0xFFFD;// used when the pair cannot be mapped
    if ($trail>127)
    {
      // Strip the high bit of both bytes to get the table key.
      $key=(($lead<<8)|$trail)&0x7F7F;
      $i+=2;
      if (isset($____global_codetable[$key]))
        $code=hexdec($____global_codetable[$key]);
    }
    else
    {
      // Truncated or broken pair: drop only the lead byte.
      $i++;
    }
    if ($code<0x80)// 7 bits: 1 byte
    {
      $utf8.=chr($code);
    }
    else if ($code<0x800)// 11 bits: 5+6
    {
      $utf8.=chr(0xC0|(($code>>6)&0x1F));
      $utf8.=chr(0x80|($code&0x3F));
    }
    else if ($code<0x10000)// 16 bits: 4+6+6
    {
      $utf8.=chr(0xE0|(($code>>12)&0x0F));
      $utf8.=chr(0x80|(($code>>6)&0x3F));
      $utf8.=chr(0x80|($code&0x3F));
    }
    else// 21 bits: 3+6+6+6
    {
      $utf8.=chr(0xF0|(($code>>18)&0x07));
      $utf8.=chr(0x80|(($code>>12)&0x3F));
      $utf8.=chr(0x80|(($code>>6)&0x3F));
      $utf8.=chr(0x80|($code&0x3F));
    }
  }
  return $utf8;
}
?>

访问技术无忧网,软硬件通吃保你技术无忧!中文网址http://www.技术无忧.com 或 http://www.技术无忧.net


相关文章

    无相关信息

文章评论

共有 0 位网友发表了评论 此处只显示部分留言 点击查看完整评论页面