打开APP
userphoto
未登录

开通VIP,畅享免费电子书等14项超值服

开通VIP
SEO之泛采集——HTML正文抽取算法

很多SEO软件有泛采集功能,只需要指定关键字,就自动抓取相关文章。这种抓取技术,需要用到HTML正文抽取算法,这里分享根据cx-extractor线性算法php版编写的VB HTML正文抽取类模块。

感谢cx,感谢xwf_like。

参考资料:http://code.google.com/p/cx-extractor 以及         http://hi.baidu.com/xwf_like

'========================================
'模块名称:clsHtmlExtractor
'模块作用:从HTML中抽取正文,根据http header或者html自动获取编码
'模块编写:楚吟风 QQ:112704422  http://www.chuyinfeng.com
'模块更新:2011-03-15
'模块说明:感谢cx的基于行块分布函数的通用网页正文抽取算法
'========================================

Option Explicit

'========================================
'函数名称:ReplaceX
'函数作用:正则替换
'========================================
Public Function ReplaceX(ByVal sSource As String, ByVal sPattern As String, ByVal sTarget As String) As String
On Error GoTo ErrHandle
    Dim RegEx, ReplaceTest As String, sRet As String
    Set RegEx = CreateObject("VBSCRIPT.REGEXP")
    RegEx.IgnoreCase = True
    RegEx.Global = True
    RegEx.Pattern = sPattern
    sRet = RegEx.Replace(sSource, sTarget)
    Set RegEx = Nothing
    ReplaceX = sRet
    Exit Function
ErrHandle:
    Set RegEx = Nothing
End Function

'========================================
'函数名称:InstrX
'函数作用:正则查找
'========================================
Public Function InstrX(ByVal Source As String, ByVal sPattern As String, Optional ByRef strs As Variant) As Integer
On Error GoTo ErrHandle
    Dim i As Integer
    ReDim strs(i)
    Dim RegEx, Matches, Match, sCSet As String
    Set RegEx = CreateObject("VBSCRIPT.REGEXP")
    RegEx.IgnoreCase = True
    RegEx.Global = True
    RegEx.Pattern = sPattern
    If RegEx.Test(Source) Then
        Set Matches = RegEx.execute(Source)
        For Each Match In Matches
            i = i + 1
            ReDim Preserve strs(i)
            strs(i) = Match.Value
        Next
    End If
    Set Match = Nothing '
    Set Matches = Nothing
    Set RegEx = Nothing
   
    InstrX = i
    Exit Function
ErrHandle:
    Set RegEx = Nothing
End Function

'========================================
'函数名称:GetCset
'函数作用:根据给定的字符串获取html编码方式
'========================================
Public Function GetCset(ByVal Source As String) As String
    Dim i As Integer, strs() As String, sCSet As String
    i = InstrX(Source, "content-type.*?charset.*?=.*", strs)
    If i > 0 Then sCSet = strs(1)
    sCSet = ReplaceX(sCSet, ".*charset.*?=", "")
    sCSet = ReplaceX(sCSet, """|\s|/|>", "")
    GetCset = sCSet
End Function

 

'========================================
'函数名称:LenX
'函数作用:把全角字符做为2字节计算长度,忽略空格长度
'========================================
Private Function LenX(ByVal s_str As String) As Integer
    Dim i_num As Integer, i_index As Integer, i_len As Integer
    s_str = Replace(s_str, " ", "")
    i_len = Len(s_str)
    For i_index = 1 To i_len
        If Asc(Mid(s_str, i_index, 1)) < 0 Then
            i_num = i_num + 1
        End If
    Next
    LenX = i_len + i_num
End Function


'========================================
'函数名称:Extract
'函数作用:根据cx-extractor算法抽取正文
'========================================
Public Function Extract(ByVal Source As String, Optional ByVal BlockLine As Integer = 3, Optional ByVal OneLine As Boolean = True)
    Dim sLine() As String, iLine() As Long, i As Integer, iBlockLen() As Long, sBlock() As String
    Dim iStart As Long, iEnd As Long, iMaxLen As Long, iTemp As Long
    Dim sPortion As String, iCurTextLen As Long, sTemp As String, sOneLine As String
    sOneLine = IIf(OneLine, "", vbCrLf)
   
   
    '初步去噪
   
    '去除DTD信息
    Source = ReplaceX(Source, "<!DOCTYPE.*?>", "")
    '去除注释
    Source = ReplaceX(Source, "<!--(.|\n)*?-->", "")
    '去除script标签
    Source = ReplaceX(Source, "<script.*?>(.|\n)*?<\/script>", "")
    '去除style标签
    Source = ReplaceX(Source, "<style.*?>(.|\n)*?<\/style>", "")
    '去除html tag标签
    Source = ReplaceX(Source, "<(.|\n)*?>", "")
    '去除特殊字符
    Source = ReplaceX(Source, "&.{1,5};|&#.{1,5};", "")
   
   
    '规范换行
    Source = Replace(Source, vbCrLf, vbLf)
    Source = Replace(Source, vbCr, vbLf)
    Source = Replace(Source, vbLf, vbCrLf)
   
    '分割到行
    sLine = Split(Source, vbCrLf)
    ReDim iBlockLen(0)
    For i = 0 To UBound(sLine)
        '将多个空白字符替换为一个空格
        sLine(i) = ReplaceX(sLine(i), "\s+", " ")
    Next
   
    '计算第一块大小
    For i = 0 To (BlockLine - 1)
        iBlockLen(0) = iBlockLen(0) + LenX(sLine(i))
    Next
   
    '计算其他块大小
    For i = 1 To UBound(sLine) - BlockLine - 1
        ReDim Preserve iBlockLen(i)
        iBlockLen(i) = iBlockLen(i - 1) + LenX(sLine(i - 1 + BlockLine)) - LenX(sLine(i - 1))
    Next
   
    '根据各个块大小变化的峰值峰谷提取正文
    iStart = -1: iEnd = -1: i = 0
   
   
    Do While i < UBound(iBlockLen)
        Do While (i < UBound(iBlockLen) And iBlockLen(i) = 0)
            i = i + 1
        Loop
       
        iTemp = i
        iCurTextLen = 0
        sPortion = ""
       
        Do While (i < UBound(iBlockLen) And iBlockLen(i) <> 0)
            sPortion = sPortion & sLine(i) & sOneLine
            iCurTextLen = iCurTextLen + iBlockLen(i)
            i = i + 1
        Loop
       
        If iCurTextLen > iMaxLen Then
            sTemp = sPortion
            iMaxLen = iCurTextLen
            iStart = iTemp
            iEnd = i - 1
        End If
       
    Loop
    'MsgBox sLine(iStart - 1), , iStart
    'MsgBox sLine(iEnd + 1), , iEnd
    Extract = sTemp
End Function

本站仅提供存储服务,所有内容均由用户发布,如发现有害或侵权内容,请点击举报
打开APP,阅读全文并永久保存 查看更多类似文章
猜你喜欢
类似文章
自用的一个vb类
正则表达式对象和RegExp对象
判断身份证是否合法
去掉[]中的英文字符
把123456789转换为12-345-6789的三种方法
C#处理空格和换行
更多类似文章 >>
生活服务
分享 收藏 导长图 关注 下载文章
绑定账号成功
后续可登录账号畅享VIP特权!
如果VIP功能使用有故障,
可点击这里联系客服!

联系客服