简单说,“分词”就是指将一个句子拆分为一个个词语。本文的例子调用的是“海量分词”的 DLL 库。
海量分词官方网站 http://www.hylanda.com/ (没看到有下载的地方)
试用装下载 http://www.onlinedown.net/soft/39759.htm
例子程序实际来自 https://code.google.com/p/jamessrc/source/checkout
这里我将上面的例子修改为动态调用(即在运行时通过 LoadLibrary / GetProcAddress 加载 DLL 并获取函数地址)。其中的 HLSSplit.dat 是 DLL 自带的分词词库,使用时需要通过 HLSplitInit 来指定这个词库所在的路径。
主要代码例子如下:
unit Unit1;

{ Demo form for the "HLSSplit.dll" word-segmentation library.

  The DLL is loaded at run time with LoadLibrary/GetProcAddress. The text in
  Memo1 is passed to HLSplitWord, and every returned word, followed by a
  part-of-speech suffix, is written to Memo2 separated by spaces. Label1 shows
  the number of words reported by the DLL. }

interface

uses
  Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
  Dialogs, StdCtrls;

type
  { One segmentation result as returned by HLGetWordAt.
    The text pointer is declared as PAnsiChar explicitly: the DLL exchanges
    8-bit strings (its text parameter is LPCSTR), and plain PChar would become
    a UTF-16 pointer on Unicode-enabled Delphi versions. }
  SHLSegWord = record
    s_szWord: PAnsiChar;  // word text
    s_dwPOS: longint;     // part-of-speech flags (NATURE_* bits)
    s_fWeight: single;    // keyword weight; 0 when the word is not a keyword
  end;

  TForm1 = class(TForm)
    Memo1: TMemo;
    Button1: TButton;
    Memo2: TMemo;
    Label1: TLabel;
    procedure Button1Click(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;

  { Signatures of the functions resolved from HLSSplit.dll at run time. }
  THLSplitInit = function(lpszDataFilePath: PAnsiChar): boolean; cdecl;
  // Declared as a procedure: a C DLL cannot return a managed Delphi string,
  // and a "function: string" type makes the compiler pass a hidden result
  // parameter the DLL knows nothing about. Any value the DLL might return in
  // a register is simply ignored under cdecl.
  THLFreeSplit = procedure; cdecl;
  THLOpenSplit = function: THandle; cdecl;
  THLCloseSplit = function(hHandle: THandle): boolean; cdecl;
  // NOTE(review): this is the only import declared stdcall while all others
  // are cdecl. Left unchanged; confirm against the vendor header.
  THLSplitWord = function(hHandle: THandle; b: LPCSTR;
    iExtraCalcFlag: integer = 0): boolean; stdcall;
  THLGetWordCnt = function(hHandle: THandle): integer; cdecl;
  THLGetWordAt = function(hHandle: THandle; iIndex: integer): Pointer; cdecl;

var
  Form1: TForm1;

{ Extra-calculation flags for HLSplitWord. }
const HL_CAL_OPT_KEYWORD: Integer = $1; // compute keywords
const HL_CAL_OPT_FINGER: Integer = $2;  // compute the semantic fingerprint of the text
const HL_CAL_OPT_POS: Integer = $4;     // compute part of speech
const HL_CAL_OPT_SEARCH: Integer = $8;  // output search-oriented segmentation

{ Part-of-speech bits found in SHLSegWord.s_dwPOS. }
const NATURE_D_A: Integer = $40000000;  // adjective / adjective morpheme
const NATURE_D_B: Integer = $20000000;  // distinguishing word / morpheme
const NATURE_D_C: Integer = $10000000;  // conjunction / conjunction morpheme
const NATURE_D_D: Integer = $08000000;  // adverb / adverb morpheme
const NATURE_D_E: Integer = $04000000;  // interjection / interjection morpheme
const NATURE_D_F: Integer = $02000000;  // locative word / locative morpheme
const NATURE_D_I: Integer = $01000000;  // idiom (chengyu)
const NATURE_D_L: Integer = $00800000;  // set phrase
const NATURE_A_M: Integer = $00400000;  // numeral / numeral morpheme
const NATURE_D_MQ: Integer = $00200000; // numeral-classifier compound
const NATURE_D_N: Integer = $00100000;  // noun / noun morpheme
const NATURE_D_O: Integer = $00080000;  // onomatopoeia
const NATURE_D_P: Integer = $00040000;  // preposition
const NATURE_A_Q: Integer = $00020000;  // classifier / classifier morpheme
const NATURE_D_R: Integer = $00010000;  // pronoun / pronoun morpheme
const NATURE_D_S: Integer = $00008000;  // place word
const NATURE_D_T: Integer = $00004000;  // time word
const NATURE_D_U: Integer = $00002000;  // particle / particle morpheme
const NATURE_D_V: Integer = $00001000;  // verb / verb morpheme
const NATURE_D_W: Integer = $00000800;  // punctuation
const NATURE_D_X: Integer = $00000400;  // non-morpheme character
const NATURE_D_Y: Integer = $00000200;  // modal particle / modal morpheme
const NATURE_D_Z: Integer = $00000100;  // status word
const NATURE_A_NR: Integer = $00000080; // person name
const NATURE_A_NS: Integer = $00000040; // place name
const NATURE_A_NT: Integer = $00000020; // organisation
const NATURE_A_NX: Integer = $00000010; // foreign-language characters
const NATURE_A_NZ: Integer = $00000008; // other proper noun
const NATURE_D_H: Integer = $00000004;  // prefix component
const NATURE_D_K: Integer = $00000002;  // suffix component

implementation

{$R *.dfm}

{ Returns strWord with a part-of-speech suffix appended.
  The flags in dwPos are tested in a fixed priority order and only the first
  matching flag contributes a suffix; '/?' is appended when none matches. }
function AddNatureString(strWord: string; dwPos: longint): string;

  // True when every bit of Flag is set in dwPos.
  function Has(Flag: longint): boolean;
  begin
    Result := (dwPos and Flag) = Flag;
  end;

var
  Suffix: string;
begin
  if Has(NATURE_D_A) then Suffix := '/a'         // adjective
  else if Has(NATURE_D_B) then Suffix := '/b'    // distinguishing word
  else if Has(NATURE_D_C) then Suffix := '/c'    // conjunction
  else if Has(NATURE_D_D) then Suffix := '/d'    // adverb
  else if Has(NATURE_D_E) then Suffix := '/e'    // interjection
  else if Has(NATURE_D_F) then Suffix := '/f'    // locative word
  else if Has(NATURE_D_I) then Suffix := '/i'    // idiom
  else if Has(NATURE_D_L) then Suffix := '/l'    // set phrase
  else if Has(NATURE_A_M) then Suffix := '/m'    // numeral
  else if Has(NATURE_D_MQ) then Suffix := '/mq'  // numeral-classifier compound
  else if Has(NATURE_D_N) then Suffix := '/n'    // noun
  else if Has(NATURE_D_O) then Suffix := '/o'    // onomatopoeia
  else if Has(NATURE_D_P) then Suffix := '/p'    // preposition
  else if Has(NATURE_A_Q) then Suffix := '/q'    // classifier
  else if Has(NATURE_D_R) then Suffix := '/r'    // pronoun
  else if Has(NATURE_D_S) then Suffix := '/s'    // place word
  else if Has(NATURE_D_T) then Suffix := '/t'    // time word
  else if Has(NATURE_D_U) then Suffix := '/u'    // particle
  else if Has(NATURE_D_V) then Suffix := '/v'    // verb
  else if Has(NATURE_D_W) then Suffix := '/w'    // punctuation
  else if Has(NATURE_D_X) then Suffix := '/x'    // non-morpheme character
  else if Has(NATURE_D_Y) then Suffix := '/y'    // modal particle
  else if Has(NATURE_D_Z) then Suffix := '/z'    // status word
  else if Has(NATURE_A_NR) then Suffix := '/nr'  // person name
  else if Has(NATURE_A_NS) then Suffix := '/ns'  // place name
  else if Has(NATURE_A_NT) then Suffix := '/nt'  // organisation
  else if Has(NATURE_A_NX) then Suffix := '/nx'  // foreign-language characters
  else if Has(NATURE_A_NZ) then Suffix := '/nz'  // other proper noun
  else if Has(NATURE_D_H) then Suffix := '/h'    // prefix component
  else if Has(NATURE_D_K) then Suffix := '/k'    // suffix component
  else Suffix := '/?';                           // unknown part of speech
  Result := strWord + Suffix;
end;

{ Segments the text of Memo1 with HLSSplit.dll and shows the result in Memo2.

  Every resource acquired here (the DLL module, the initialised segmenter,
  the segmentation handle) is released in a try/finally block, so the early
  exits on failure no longer leak the loaded library. }
procedure TForm1.Button1Click(Sender: TObject);
var
  DllHandle, hHandle: THandle;
  LoadError: DWORD;
  i, iExtraCalcFlag, nResultCnt: integer;
  InputText: AnsiString;
  strResult, strWord: string;
  pWord: ^SHLSegWord;
  HLSplitInit: THLSplitInit;
  HLFreeSplit: THLFreeSplit;
  HLOpenSplit: THLOpenSplit;
  HLCloseSplit: THLCloseSplit;
  HLSplitWord: THLSplitWord;
  HLGetWordCnt: THLGetWordCnt;
  HLGetWordAt: THLGetWordAt;
begin
  // Pass the literal directly: LoadLibrary takes a PChar, which is a wide
  // pointer on Unicode Delphi, so forcing PAnsiChar there is wrong.
  DllHandle := LoadLibrary('HLSSplit.dll');
  if DllHandle = 0 then
  begin
    // Capture the error code before any other API call can overwrite it.
    LoadError := GetLastError;
    ShowMessage('Loading DLL Error! ' + SysErrorMessage(LoadError));
    Exit;
  end;

  try
    @HLSplitInit := GetProcAddress(DllHandle, 'HLSplitInit');
    @HLFreeSplit := GetProcAddress(DllHandle, 'HLFreeSplit');
    @HLOpenSplit := GetProcAddress(DllHandle, 'HLOpenSplit');
    @HLCloseSplit := GetProcAddress(DllHandle, 'HLCloseSplit');
    @HLSplitWord := GetProcAddress(DllHandle, 'HLSplitWord');
    @HLGetWordCnt := GetProcAddress(DllHandle, 'HLGetWordCnt');
    @HLGetWordAt := GetProcAddress(DllHandle, 'HLGetWordAt');

    // HLFreeSplit is checked too: it is called below and would otherwise be
    // invoked through a nil pointer if the export were missing.
    if not (Assigned(HLSplitInit) and Assigned(HLFreeSplit) and
            Assigned(HLOpenSplit) and Assigned(HLCloseSplit) and
            Assigned(HLSplitWord) and Assigned(HLGetWordCnt) and
            Assigned(HLGetWordAt)) then
    begin
      ShowMessage('Loading function Error!');
      Exit;
    end;

    // The argument is the path of the dictionary file; empty means the
    // current path.
    if not HLSplitInit('') then
    begin
      ShowMessage('海量分词初始化失败!');
      Exit;
    end;

    try
      // Create the segmentation handle.
      hHandle := HLOpenSplit();
      if hHandle = INVALID_HANDLE_VALUE then
      begin
        ShowMessage('创建分词句柄失败!');
        Exit; // the enclosing finally unloads the dictionary
      end;

      try
        // Segment the text and process the result.
        iExtraCalcFlag := HL_CAL_OPT_POS; // request part-of-speech calculation
        // Keep the converted text in a local so the pointer handed to the
        // DLL stays valid for the whole call.
        InputText := AnsiString(Memo1.Text);
        strResult := ' ';

        if HLSplitWord(hHandle, PAnsiChar(InputText), iExtraCalcFlag) then
        begin
          nResultCnt := HLGetWordCnt(hHandle); // number of words
          Label1.Caption := IntToStr(nResultCnt);
          for i := 0 to nResultCnt - 1 do
          begin
            pWord := HLGetWordAt(hHandle, i); // fetch one result
            // Explicit nil/empty checks instead of comparing a PChar to ''.
            if (pWord <> nil) and (pWord^.s_szWord <> nil) and
               (pWord^.s_szWord^ <> #0) then
            begin
              strWord := string(AnsiString(pWord^.s_szWord));
              // Bit test, so the suffix is still added when other flags are
              // combined with HL_CAL_OPT_POS.
              if (iExtraCalcFlag and HL_CAL_OPT_POS) <> 0 then
                strWord := AddNatureString(strWord, pWord^.s_dwPOS);
              // Words are separated by a single space.
              strResult := strResult + strWord + ' ';
            end;
          end;
        end
        else
          ShowMessage('分词失败!');

        Memo2.Text := strResult;
      finally
        // Close the segmentation handle.
        HLCloseSplit(hHandle);
      end;
    finally
      // Unload the segmentation dictionary.
      HLFreeSplit();
    end;
  finally
    FreeLibrary(DllHandle);
  end;
end;

end.
你好,运行LoadLibrary(PAnsiChar('HLSSplit.dll'))时总是返回0,请问是哪里错了吗?
这个太久没有搞了,我也不清楚了。
你在 LoadLibrary 返回 0 之后立即调用 GetLastError 看看错误码是多少?