很多SEO软件有泛采集功能,只需要指定关键字,就自动抓取相关文章。这种抓取技术,需要用到HTML正文抽取算法,这里分享根据cx-extractor线性算法php版编写的VB HTML正文抽取类模块。
感谢cx,感谢xwf_like。
参考资料:http://code.google.com/p/cx-extractor 以及 http://hi.baidu.com/xwf_like
'========================================
'模块名称:clsHtmlExtractor
'模块作用:从HTML中抽取正文,根据http header或者html自动获取编码
'模块编写:楚吟风 QQ:112704422 http://www.chuyinfeng.com
'模块更新:2011-03-15
'模块说明:感谢cx的基于行块分布函数的通用网页正文抽取算法
'========================================
Option Explicit
'========================================
'函数名称:ReplaceX
'函数作用:正则替换
'========================================
Public Function ReplaceX(ByVal sSource As String, ByVal sPattern As String, ByVal sTarget As String) As String
On Error GoTo ErrHandle
Dim RegEx, ReplaceTest As String, sRet As String
Set RegEx = CreateObject("VBSCRIPT.REGEXP")
RegEx.IgnoreCase = True
RegEx.Global = True
RegEx.Pattern = sPattern
sRet = RegEx.Replace(sSource, sTarget)
Set RegEx = Nothing
ReplaceX = sRet
Exit Function
ErrHandle:
Set RegEx = Nothing
End Function
'========================================
'函数名称:InstrX
'函数作用:正则查找
'========================================
Public Function InstrX(ByVal Source As String, ByVal sPattern As String, Optional ByRef strs As Variant) As Integer
On Error GoTo ErrHandle
Dim i As Integer
ReDim strs(i)
Dim RegEx, Matches, Match, sCSet As String
Set RegEx = CreateObject("VBSCRIPT.REGEXP")
RegEx.IgnoreCase = True
RegEx.Global = True
RegEx.Pattern = sPattern
If RegEx.Test(Source) Then
Set Matches = RegEx.execute(Source)
For Each Match In Matches
i = i + 1
ReDim Preserve strs(i)
strs(i) = Match.Value
Next
End If
Set Match = Nothing '
Set Matches = Nothing
Set RegEx = Nothing
InstrX = i
Exit Function
ErrHandle:
Set RegEx = Nothing
End Function
'========================================
'函数名称:GetCset
'函数作用:根据给定的字符串获取html编码方式
'========================================
Public Function GetCset(ByVal Source As String) As String
Dim i As Integer, strs() As String, sCSet As String
i = InstrX(Source, "content-type.*?charset.*?=.*", strs)
If i > 0 Then sCSet = strs(1)
sCSet = ReplaceX(sCSet, ".*charset.*?=", "")
sCSet = ReplaceX(sCSet, """|\s|/|>", "")
GetCset = sCSet
End Function
'========================================
'函数名称:LenX
'函数作用:把全角字符做为2字节计算长度,忽略空格长度
'========================================
Private Function LenX(ByVal s_str As String) As Integer
Dim i_num As Integer, i_index As Integer, i_len As Integer
s_str = Replace(s_str, " ", "")
i_len = Len(s_str)
For i_index = 1 To i_len
If Asc(Mid(s_str, i_index, 1)) < 0 Then
i_num = i_num + 1
End If
Next
LenX = i_len + i_num
End Function
'========================================
'函数名称:Extract
'函数作用:根据cx-extractor算法抽取正文
'========================================
Public Function Extract(ByVal Source As String, Optional ByVal BlockLine As Integer = 3, Optional ByVal OneLine As Boolean = True)
Dim sLine() As String, iLine() As Long, i As Integer, iBlockLen() As Long, sBlock() As String
Dim iStart As Long, iEnd As Long, iMaxLen As Long, iTemp As Long
Dim sPortion As String, iCurTextLen As Long, sTemp As String, sOneLine As String
sOneLine = IIf(OneLine, "", vbCrLf)
'初步去噪
'去除DTD信息
Source = ReplaceX(Source, "<!DOCTYPE.*?>", "")
'去除注释
Source = ReplaceX(Source, "<!--(.|\n)*?-->", "")
'去除script标签
Source = ReplaceX(Source, "<script.*?>(.|\n)*?<\/script>", "")
'去除style标签
Source = ReplaceX(Source, "<style.*?>(.|\n)*?<\/style>", "")
'去除html tag标签
Source = ReplaceX(Source, "<(.|\n)*?>", "")
'去除特殊字符
Source = ReplaceX(Source, "&.{1,5};|&#.{1,5};", "")
'规范换行
Source = Replace(Source, vbCrLf, vbLf)
Source = Replace(Source, vbCr, vbLf)
Source = Replace(Source, vbLf, vbCrLf)
'分割到行
sLine = Split(Source, vbCrLf)
ReDim iBlockLen(0)
For i = 0 To UBound(sLine)
'将多个空白字符替换为一个空格
sLine(i) = ReplaceX(sLine(i), "\s+", " ")
Next
'计算第一块大小
For i = 0 To (BlockLine - 1)
iBlockLen(0) = iBlockLen(0) + LenX(sLine(i))
Next
'计算其他块大小
For i = 1 To UBound(sLine) - BlockLine - 1
ReDim Preserve iBlockLen(i)
iBlockLen(i) = iBlockLen(i - 1) + LenX(sLine(i - 1 + BlockLine)) - LenX(sLine(i - 1))
Next
'根据各个块大小变化的峰值峰谷提取正文
iStart = -1: iEnd = -1: i = 0
Do While i < UBound(iBlockLen)
Do While (i < UBound(iBlockLen) And iBlockLen(i) = 0)
i = i + 1
Loop
iTemp = i
iCurTextLen = 0
sPortion = ""
Do While (i < UBound(iBlockLen) And iBlockLen(i) <> 0)
sPortion = sPortion & sLine(i) & sOneLine
iCurTextLen = iCurTextLen + iBlockLen(i)
i = i + 1
Loop
If iCurTextLen > iMaxLen Then
sTemp = sPortion
iMaxLen = iCurTextLen
iStart = iTemp
iEnd = i - 1
End If
Loop
'MsgBox sLine(iStart - 1), , iStart
'MsgBox sLine(iEnd + 1), , iEnd
Extract = sTemp
End Function