海量中文分词 Delphi 例子

简单说“分词”就是指将一个句子拆分为一个个词语。调用的是“海量分词”的DLL库。

海量分词官方网站 http://www.hylanda.com/ (没看到有下载的地方)
试用版下载 http://www.onlinedown.net/soft/39759.htm
例子程序实际来自 https://code.google.com/p/jamessrc/source/checkout

这里我将上面的例子修改为动态调用。其中的 HLSSplit.dat 是DLL自带的分词库,在使用时需要通过 HLSplitInit 来指定这个库的路径。

主要代码例子如下

unit Unit1;

interface

uses
  Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
  Dialogs, StdCtrls;

type
  { One segmentation result entry. Button1Click casts the pointer returned by
    HLGetWordAt to ^SHLSegWord, so the field order and sizes here must match
    the DLL's own structure.
    s_szWord is declared PAnsiChar rather than PChar: the DLL accepts its
    input text as LPCSTR (ANSI), and PChar would silently become PWideChar on
    Unicode-enabled Delphi versions. On Delphi 7 the two types are identical. }
  SHLSegWord = record
    s_szWord: PAnsiChar; // word text (zero-terminated string)
    s_dwPOS: longint;    // part-of-speech flag bits (see the NATURE_* masks)
    s_fWeight: single;   // keyword weight; 0 when the word is not a keyword
  end;

  { Main demo form. Button1Click reads the text of Memo1, runs it through the
    segmentation DLL, writes the segmented text to Memo2 and shows the number
    of words in Label1. }
  TForm1 = class(TForm)
    Memo1: TMemo;     // input: text to be segmented
    Button1: TButton; // its click handler, Button1Click, runs the segmentation
    Memo2: TMemo;     // output: segmented words separated by spaces
    Label1: TLabel;   // output: word count reported by the DLL
    procedure Button1Click(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;

  { Procedural types for the HLSSplit.dll entry points that Button1Click
    resolves at run time with GetProcAddress.

    NOTE(review): THLSplitWord is declared stdcall while every other entry
    point is cdecl. The DLL header is not visible here, so the convention is
    left exactly as it was - confirm against the vendor's header. }

  // Initialises the segmenter; the argument is the dictionary directory
  // (Button1Click passes an empty string for the current directory).
  // PAnsiChar instead of PChar keeps the parameter ANSI on Unicode Delphi.
  THLSplitInit = function (lpszDataFilePath:PAnsiChar):boolean;cdecl;
  // Unloads the segmenter. Declared as a procedure: a non-Delphi DLL cannot
  // return a Delphi managed string, and the caller never used the result.
  THLFreeSplit = procedure;cdecl;
  // Creates a segmentation handle (compared with INVALID_HANDLE_VALUE by the caller).
  THLOpenSplit = function ():THANDLE;cdecl;
  // Closes a handle obtained from HLOpenSplit.
  THLCloseSplit = function (hHandle:THANDLE ):boolean ;cdecl;
  // Segments the ANSI text b; iExtraCalcFlag is a combination of HL_CAL_OPT_* bits.
  THLSplitWord = function  (hHandle:tHANDLE ; b:LPCSTR ; iExtraCalcFlag:integer = 0):boolean;stdcall;
  // Number of words produced by the last HLSplitWord call on this handle.
  THLGetWordCnt = function (hHandle:tHANDLE ): integer;cdecl;
  // Returns a pointer that the caller treats as ^SHLSegWord for word iIndex.
  THLGetWordAt = function (hHandle:tHANDLE; iIndex:integer):Pointer ;cdecl;

var
  Form1: TForm1;

const
  { Extra-calculation option bits for HLSplitWord's iExtraCalcFlag argument. }
  HL_CAL_OPT_KEYWORD: Integer = $1; // also compute keywords
  HL_CAL_OPT_FINGER: Integer = $2;  // also compute the document's semantic fingerprint
  HL_CAL_OPT_POS: Integer = $4;     // also compute part of speech
  HL_CAL_OPT_SEARCH: Integer = $8;  // output search-oriented segmentation results

  { Part-of-speech bit masks tested against SHLSegWord.s_dwPOS. }
  NATURE_D_A: Integer = $40000000;  // adjective / adjectival morpheme
  NATURE_D_B: Integer = $20000000;  // distinguishing word / distinguishing morpheme
  NATURE_D_C: Integer = $10000000;  // conjunction / conjunctive morpheme
  NATURE_D_D: Integer = $08000000;  // adverb / adverbial morpheme
  NATURE_D_E: Integer = $04000000;  // interjection / interjection morpheme
  NATURE_D_F: Integer = $02000000;  // locality word / locality morpheme
  NATURE_D_I: Integer = $01000000;  // set phrase (chengyu)
  NATURE_D_L: Integer = $00800000;  // idiomatic expression
  NATURE_A_M: Integer = $00400000;  // numeral / numeral morpheme
  NATURE_D_MQ: Integer = $00200000; // numeral-classifier compound
  NATURE_D_N: Integer = $00100000;  // noun / nominal morpheme
  NATURE_D_O: Integer = $00080000;  // onomatopoeia
  NATURE_D_P: Integer = $00040000;  // preposition
  NATURE_A_Q: Integer = $00020000;  // classifier (measure word) / classifier morpheme
  NATURE_D_R: Integer = $00010000;  // pronoun / pronominal morpheme
  NATURE_D_S: Integer = $00008000;  // place word
  NATURE_D_T: Integer = $00004000;  // time word
  NATURE_D_U: Integer = $00002000;  // particle / particle morpheme
  NATURE_D_V: Integer = $00001000;  // verb / verbal morpheme
  NATURE_D_W: Integer = $00000800;  // punctuation
  NATURE_D_X: Integer = $00000400;  // non-morpheme character
  NATURE_D_Y: Integer = $00000200;  // modal particle / modal morpheme
  NATURE_D_Z: Integer = $00000100;  // stative word
  NATURE_A_NR: Integer = $00000080; // person name
  NATURE_A_NS: Integer = $00000040; // place name
  NATURE_A_NT: Integer = $00000020; // organisation or group
  NATURE_A_NX: Integer = $00000010; // foreign-language characters
  NATURE_A_NZ: Integer = $00000008; // other proper noun
  NATURE_D_H: Integer = $00000004;  // prefix component
  NATURE_D_K: Integer = $00000002;  // suffix component


implementation

{$R *.dfm}

{ Returns strWord with a part-of-speech tag appended.
  dwPos is tested against the NATURE_* masks in a fixed priority order; the
  tag of the first mask whose bits are all set in dwPos is appended. When no
  mask matches, '/?' (unknown part of speech) is appended instead. }
function AddNatureString(strWord:string; dwPos:longint):string;
const
  // Tag texts, in exactly the same order as the masks handed to FirstMatch below.
  TAGS: array[0..29] of string = (
    '/a',  '/b',  '/c',  '/d',  '/e',  '/f',  '/i',  '/l',  '/m',  '/mq',
    '/n',  '/o',  '/p',  '/q',  '/r',  '/s',  '/t',  '/u',  '/v',  '/w',
    '/x',  '/y',  '/z',  '/nr', '/ns', '/nt', '/nx', '/nz', '/h',  '/k');
var
  hit: Integer;

  // Index of the first mask fully contained in dwPos, or -1 when none is.
  function FirstMatch(const masks: array of Integer): Integer;
  var
    k: Integer;
  begin
    Result := -1;
    for k := Low(masks) to High(masks) do
      if (dwPos and masks[k]) = masks[k] then
      begin
        Result := k;
        Break;
      end;
  end;

begin
  // The masks are passed as an open-array constructor because the NATURE_*
  // values are typed constants and cannot initialise a constant table.
  hit := FirstMatch([
    NATURE_D_A,  NATURE_D_B,  NATURE_D_C,  NATURE_D_D,  NATURE_D_E,
    NATURE_D_F,  NATURE_D_I,  NATURE_D_L,  NATURE_A_M,  NATURE_D_MQ,
    NATURE_D_N,  NATURE_D_O,  NATURE_D_P,  NATURE_A_Q,  NATURE_D_R,
    NATURE_D_S,  NATURE_D_T,  NATURE_D_U,  NATURE_D_V,  NATURE_D_W,
    NATURE_D_X,  NATURE_D_Y,  NATURE_D_Z,  NATURE_A_NR, NATURE_A_NS,
    NATURE_A_NT, NATURE_A_NX, NATURE_A_NZ, NATURE_D_H,  NATURE_D_K]);

  if hit >= 0 then
    Result := strWord + TAGS[hit]
  else
    Result := strWord + '/?'; // unknown part of speech
end;

{ Segments the text in Memo1 with HLSSplit.dll, loaded dynamically.

  Steps: load the DLL, resolve its entry points, initialise the segmenter,
  open a segmentation handle, split the text with part-of-speech calculation,
  then write the tagged words (space separated) to Memo2 and the word count
  to Label1.

  Every acquired resource (library, segmenter, handle) is released through
  try..finally, so the early-exit paths no longer leak the DLL handle.
  On any failure a message box is shown and Memo2 is left untouched, except
  for a failed split, where Memo2 receives a single space as before. }
procedure TForm1.Button1Click(Sender: TObject); //stdcall;
var
  bSuccess: Boolean;
  hHandle, DllHandle: THandle;
  i, iExtraCalcFlag, nResultCnt: Integer;
  strResult, strWord: string;
  srcText: AnsiString;
  pWord: ^SHLSegWord;
  HLSplitInit: THLSplitInit;
  HLFreeSplit: THLFreeSplit;
  HLOpenSplit: THLOpenSplit;
  HLCloseSplit: THLCloseSplit;
  HLSplitWord: THLSplitWord;
  HLGetWordCnt: THLGetWordCnt;
  HLGetWordAt: THLGetWordAt;
begin
  // A plain literal works with both the ANSI and the Unicode LoadLibrary;
  // the former PAnsiChar cast only compiled on ANSI Delphi versions.
  DllHandle := LoadLibrary('HLSSplit.dll');
  if DllHandle = 0 then
  begin
    ShowMessage('Loading DLL Error!');
    Exit;
  end;

  try
    @HLSplitInit := GetProcAddress(DllHandle, 'HLSplitInit');
    @HLFreeSplit := GetProcAddress(DllHandle, 'HLFreeSplit');
    @HLOpenSplit := GetProcAddress(DllHandle, 'HLOpenSplit');
    @HLCloseSplit := GetProcAddress(DllHandle, 'HLCloseSplit');
    @HLSplitWord := GetProcAddress(DllHandle, 'HLSplitWord');
    @HLGetWordCnt := GetProcAddress(DllHandle, 'HLGetWordCnt');
    @HLGetWordAt := GetProcAddress(DllHandle, 'HLGetWordAt');

    // HLFreeSplit is called below, so it must be verified as well.
    if (not Assigned(@HLSplitInit)) or
       (not Assigned(@HLFreeSplit)) or
       (not Assigned(@HLOpenSplit)) or
       (not Assigned(@HLCloseSplit)) or
       (not Assigned(@HLSplitWord)) or
       (not Assigned(@HLGetWordCnt)) or
       (not Assigned(@HLGetWordAt)) then
    begin
      ShowMessage('Loading function Error!');
      Exit; // the outer finally still frees the library
    end;

    // The argument is the dictionary directory; empty means the current path.
    if not HLSplitInit('') then
    begin
      ShowMessage('海量分词初始化失败!');
      Exit;
    end;

    try
      // Create the segmentation handle.
      hHandle := HLOpenSplit();
      if hHandle = INVALID_HANDLE_VALUE then
      begin
        ShowMessage('创建分词句柄失败!');
        Exit; // the finally blocks unload the dictionary and the library
      end;

      try
        iExtraCalcFlag := HL_CAL_OPT_POS; // request part-of-speech calculation

        // Keep the text in a local AnsiString so the pointer handed to the
        // DLL stays valid and is ANSI regardless of the Delphi version.
        srcText := AnsiString(Memo1.Text);
        bSuccess := HLSplitWord(hHandle, PAnsiChar(srcText), iExtraCalcFlag);

        strResult := ' ';
        if bSuccess then
        begin
          nResultCnt := HLGetWordCnt(hHandle); // number of words found
          Label1.Caption := IntToStr(nResultCnt);

          for i := 0 to nResultCnt - 1 do
          begin
            pWord := HLGetWordAt(hHandle, i);
            // Skip missing entries and empty words without dereferencing nil.
            if (pWord <> nil) and (pWord^.s_szWord <> nil) and
               (pWord^.s_szWord^ <> #0) then
            begin
              strWord := string(pWord^.s_szWord);
              // Test the option as a bit so combined flags keep working.
              if (iExtraCalcFlag and HL_CAL_OPT_POS) <> 0 then
                strWord := AddNatureString(strWord, pWord^.s_dwPOS);
              // Words in the result are separated by a single space.
              strResult := strResult + strWord + ' ';
            end;
          end;
        end
        else
          ShowMessage('分词失败!');
      finally
        HLCloseSplit(hHandle); // close the segmentation handle
      end;
    finally
      HLFreeSplit(); // unload the segmentation dictionary
    end;
  finally
    FreeLibrary(DllHandle);
  end;

  // Reached only when no early Exit happened; strResult is a Delphi string
  // and does not depend on the already unloaded DLL.
  Memo2.Text := strResult;
end;

end.

 

slip

下载 http://www.lab-z.com/wp-content/uploads/delphi/hlzw.zip

发表评论

电子邮件地址不会被公开。 必填项已用*标注