1G多的XML转档

问题:
   存在一个1G多的XML文件,需要将其中的数据,按照一定的逻辑提取数据,做一定的格式化处理和处理逻辑,生成3个不同的文本文件

解决:
   做XML的转换,有如下的几个选择:1. DOM + XSL;2. DOM + 自己解析;3. SAX + 自己解析。因为担心performance的问题,所以决定采用SAX + Stack的方式来处理。因为要生成3个文件,所以设计3个类来分别负责产生3个文件,然后通过链表的方式串接起来,让他们截获自己concern的内容,做进一步的处理,最后各自产生自己负责的文件。


其中:
XmlTextReader : .NET提供的只进、只读的XML Reader(pull模式,用法与SAX类似)。他会逐一读到XML的指令,COMMENT,开始TAG,文本,结束TAG等内容

Processor : 解析XML,产生文本文件的处理器接口。里面定义了5个方法。他们分别是,open():用来打开文件;close():用来关闭文件;startTag():当XmlTextReader读到开始TAG的时候,就会呼叫Processor的这个方法,将TAG名字,Tag的Stack,Value的Stack作为参数传入;endTag():当XmlTextReader读到结束TAG的时候,就会呼叫Processor的这个方法,将TAG名字,Tag的Stack,Value的Stack作为参数传入;text():当XmlTextReader读到文本内容的时候,就会呼叫Processor的这个方法,将文本值,Tag的Stack,Value的Stack作为参数传入

Executor : 作为XmlTextReader和各种Processor交互的桥梁。在他的内部,各种不同的Processor会以链表的形式组合在一块儿。当XmlTextReader读到开始Tag,文本,结束Tag的时候,就会去invoke Executor的相应方法,而Executor则会逐一invoke各种processor的相应方法,若其中有一个processor处理了某个请求,则终止向后续传。最后一个是DefaultProcessor,他的存在只是为了保证tagStack,valueStack的完整性而已。

TProcessor,DProcessor,PProcessor : 只负责接收自己感兴趣的xml内容,然后转交给TagWrapper,做格式化,业务逻辑处理而已。在startTag(),text()方法里面,处理逻辑是,如果是自己关心的内容,则直接入栈;在endTag()方法里面,处理逻辑是,如果是自己关心的内容,则判断是否到了一条记录生成完成的时候(根据endTag标志),如果是则将记录写入到文件当中,否则将tag出栈,value出栈,将value,tag名字交给TagWrapper链表,做内部的格式化和业务逻辑处理。

程序的流程如下:


部分程序代码(VB.NET)

Main的部分代码
' Streams c:\test2.xml node by node and forwards element starts, element ends
' and text content to the Executor, which dispatches them to the processors
' that write the three output files.
Dim m_xmlr As XmlTextReader = Nothing

Dim tagStack As New Stack

Dim valueStack As New Stack

Dim executor As Processor = New Executor("c:\TOTFA", "c:\DTLFA", "c:\ORDFA")
executor.open()
Try
     m_xmlr = New XmlTextReader("c:\test2.xml")
     m_xmlr.WhitespaceHandling = WhitespaceHandling.None
     While m_xmlr.Read()
          ' Node types not listed below (XML declaration, processing
          ' instructions, comments, ...) carry no data and are skipped.
          Select Case m_xmlr.NodeType
               Case XmlNodeType.Element
                    ' Capture both values before dispatching so they describe
                    ' the element the reader is currently positioned on.
                    Dim elementName As String = m_xmlr.Name
                    Dim isEmpty As Boolean = m_xmlr.IsEmptyElement
                    executor.startTag(elementName, tagStack, valueStack)
                    ' A self-closing element (<x/>) never produces an
                    ' EndElement node; synthesize the end-tag call so the tag
                    ' pushed by startTag is popped again.
                    If isEmpty Then
                         executor.endTag(elementName, tagStack, valueStack)
                    End If

               Case XmlNodeType.EndElement
                    executor.endTag(m_xmlr.Name, tagStack, valueStack)

               Case XmlNodeType.Text, XmlNodeType.CDATA
                    ' CDATA sections are character data too; treat them like
                    ' ordinary text instead of silently dropping them.
                    executor.text(m_xmlr.Value, tagStack, valueStack)
          End Select
     End While
Finally
     ' Release the input file and the output writers even when parsing fails
     ' part-way through the document.
     If m_xmlr IsNot Nothing Then
          m_xmlr.Close()
     End If
     executor.close()
End Try

 

Executor的代码
''' <summary>
''' Composite <see cref="Processor"/> that sits between the XML reader and the
''' concrete processors. Each parsing event is offered to the processors in
''' registration order until one of them reports that it handled the event.
''' </summary>
Public Class Executor
    Implements Processor

    ' Processors in dispatch order: TProcessor, DProcessor, PProcessor and
    ' finally DefaultProcessor.
    Private processors As New List(Of Processor)

    ''' <summary>
    ''' Builds the processor chain.
    ''' </summary>
    ''' <param name="TOTFA">File name handed to the TProcessor.</param>
    ''' <param name="DTLFA">File name handed to the DProcessor.</param>
    ''' <param name="ORDFA">File name handed to the PProcessor.</param>
    Public Sub New(ByVal TOTFA As String, ByVal DTLFA As String, ByVal ORDFA As String)
        Dim totalProcessor As TProcessor = New TProcessor(TOTFA)
        Dim detailProcessor As DProcessor = New DProcessor(DTLFA, totalProcessor)
        Dim orderProcessor As PProcessor = New PProcessor(ORDFA, totalProcessor, detailProcessor)
        processors.Add(totalProcessor)
        processors.Add(detailProcessor)
        processors.Add(orderProcessor)
        processors.Add(New DefaultProcessor())
    End Sub

    ''' <summary>
    ''' Offers a start tag to each processor in turn and stops at the first one
    ''' that returns True.
    ''' </summary>
    ''' <returns>Always True.</returns>
    Function startTag(ByVal tagName As String, ByRef tagStack As Stack, ByRef valueStack As Stack) As Boolean Implements Processor.startTag
        For Each handler As Processor In processors
            If handler.startTag(tagName, tagStack, valueStack) Then
                Exit For
            End If
        Next
        Return True
    End Function

    ''' <summary>
    ''' Offers an end tag to each processor in turn and stops at the first one
    ''' that returns True.
    ''' </summary>
    ''' <returns>Always True.</returns>
    Function endTag(ByVal tagName As String, ByRef tagStack As Stack, ByRef valueStack As Stack) As Boolean Implements Processor.endTag
        For Each handler As Processor In processors
            If handler.endTag(tagName, tagStack, valueStack) Then
                Exit For
            End If
        Next
        Return True
    End Function

    ''' <summary>
    ''' Offers a text value to each processor in turn and stops at the first
    ''' one that returns True.
    ''' </summary>
    ''' <returns>Always True.</returns>
    Function text(ByVal value As String, ByRef tagStack As Stack, ByRef valueStack As Stack) As Boolean Implements Processor.text
        For Each handler As Processor In processors
            If handler.text(value, tagStack, valueStack) Then
                Exit For
            End If
        Next
        Return True
    End Function

    ''' <summary>Calls open() on every processor, in registration order.</summary>
    Sub open() Implements Processor.open
        For Each handler As Processor In processors
            handler.open()
        Next
    End Sub

    ''' <summary>Calls close() on every processor, in registration order.</summary>
    Sub close() Implements Processor.close
        For Each handler As Processor In processors
            handler.close()
        Next
    End Sub
End Class

 

TProcessor代码
Imports System.IO
Imports System.Text
Imports System.Text.RegularExpressions

''' <summary>
''' Processor for the "tdata" record element and its "t" + digits field
''' elements. Field values are collected into a fixed list of TagWrapper
''' objects; when the closing "tdata" tag is seen, the formatted values of all
''' wrappers are concatenated and written to the output file.
''' </summary>
Public Class TProcessor
    Implements Processor

    ' Name of the element whose end tag completes one output record.
    Private Const RecordTag As String = "tdata"

    ' Anchored so that only the exact names "tdata" or "t" followed by digits
    ' are claimed. Without the anchors any tag merely containing such a
    ' substring (for example "part1") would be treated as belonging here.
    ' Built once and shared because it is evaluated for every XML node.
    Private Shared ReadOnly TagPattern As New Regex("^(?:tdata|t\d+)$", RegexOptions.Compiled)

    ' One wrapper per output field, in the order the fields are written.
    Private tagWrapperList As New List(Of TagWrapper)

    Private sw As StreamWriter

    Private filename As String

    ''' <summary>
    ''' Remembers the output file name and registers the wrappers for the
    ''' fields t1 to t13.
    ''' </summary>
    ''' <param name="filename">Path of the file created by <see cref="open"/>.</param>
    Public Sub New(ByVal filename As String)
        Me.filename = filename
        ' NOTE(review): the TagWrapper arguments look like (tag name,
        ' description, default value, length, ?, formatter) - confirm against
        ' the TagWrapper constructor.
        tagWrapperList.Add(New TagWrapper("t1", "栏位说明", " ", 2, 0, FormatterFactory.getInstance().getRBlankFormatter()))
        tagWrapperList.Add(New TagWrapper("t2", "栏位说明", " ", 10, 0, FormatterFactory.getInstance().getRBlankFormatter()))
        tagWrapperList.Add(New TagWrapper("t3", "栏位说明", "00000", 5, 0, FormatterFactory.getInstance().getYearMonthFormatter()))
        tagWrapperList.Add(New TagWrapper("t4", "栏位说明", " ", 1, 0, FormatterFactory.getInstance().getRBlankFormatter()))
        tagWrapperList.Add(New TagWrapper("t5", "栏位说明", " ", 1, 0, FormatterFactory.getInstance().getRBlankFormatter()))
        tagWrapperList.Add(New TagWrapper("t6", "栏位说明", "0000000", 7, 0, FormatterFactory.getInstance().getYearMonthDateFormatter()))
        tagWrapperList.Add(New TagWrapper("t7", "栏位说明", "000000", 6, 0, FormatterFactory.getInstance().getRZeroFormatter()))
        tagWrapperList.Add(New TagWrapper("t8", "栏位说明", "0000000000", 10, 0, FormatterFactory.getInstance().getRZeroFormatter()))
        tagWrapperList.Add(New TagWrapper("t9", "栏位说明", "000000", 6, 0, FormatterFactory.getInstance().getRZeroFormatter()))
        tagWrapperList.Add(New TagWrapper("t10", "栏位说明", "0000000000", 10, 0, FormatterFactory.getInstance().getRZeroFormatter()))
        tagWrapperList.Add(New TagWrapper("t11", "栏位说明", "000000", 6, 0, FormatterFactory.getInstance().getRZeroFormatter()))
        tagWrapperList.Add(New TagWrapper("t12", "栏位说明", "0000000000", 10, 0, FormatterFactory.getInstance().getRZeroFormatter()))
        tagWrapperList.Add(New TagWrapper("t13", "栏位说明", "000000", 6, 0, FormatterFactory.getInstance().getRZeroFormatter()))
    End Sub

    ''' <summary>Closes the output writer if <see cref="open"/> created one.</summary>
    Public Sub close() Implements Processor.close
        If sw IsNot Nothing Then
            sw.Close()
        End If
    End Sub

    ''' <summary>
    ''' Handles the end of an element owned by this processor.
    ''' </summary>
    ''' <param name="tagName">Name of the element being closed.</param>
    ''' <param name="tagStack">Stack of currently open tag names.</param>
    ''' <param name="valueStack">Stack of pending text values.</param>
    ''' <returns>
    ''' False when the tag is not one of this processor's tags or does not
    ''' match the tag on top of <paramref name="tagStack"/>; True otherwise.
    ''' </returns>
    Public Function endTag(ByVal tagName As String, ByRef tagStack As System.Collections.Stack, ByRef valueStack As System.Collections.Stack) As Boolean Implements Processor.endTag
        ' Only react to tags this processor owns. The empty-stack guard and
        ' OrElse keep Peek from being evaluated when there is nothing to peek.
        If tagStack.Count = 0 OrElse Not TagPattern.IsMatch(tagName) Then
            Return False
        End If
        If String.CompareOrdinal(tagName, CStr(tagStack.Peek())) <> 0 Then
            Return False
        End If

        ' The matching start tag is always removed, even when the element had
        ' no text, so tagStack stays in step with the document structure.
        tagStack.Pop()

        If String.CompareOrdinal(tagName, RecordTag) = 0 Then
            ' End of a record: write out everything collected so far.
            flush()
        ElseIf valueStack.Count <> 0 Then
            execute(tagName, CStr(valueStack.Pop()))
        End If
        ' NOTE(review): for a field element without text the wrapper keeps
        ' whatever value it already had - confirm that is the desired output.
        Return True
    End Function

    ''' <summary>Creates the output writer for the configured file name.</summary>
    Public Sub open() Implements Processor.open
        sw = New StreamWriter(Me.filename)
    End Sub

    ''' <summary>
    ''' Pushes the tag onto <paramref name="tagStack"/> when it is one of this
    ''' processor's tags.
    ''' </summary>
    ''' <returns>True when the tag was pushed; False otherwise.</returns>
    Public Function startTag(ByVal tagName As String, ByRef tagStack As System.Collections.Stack, ByRef valueStack As System.Collections.Stack) As Boolean Implements Processor.startTag
        If Not TagPattern.IsMatch(tagName) Then
            Return False
        End If
        tagStack.Push(tagName)
        Return True
    End Function

    ''' <summary>
    ''' Pushes the text onto <paramref name="valueStack"/> when the innermost
    ''' open tag is one of this processor's tags.
    ''' </summary>
    ''' <returns>True when the value was pushed; False otherwise.</returns>
    Public Function text(ByVal value As String, ByRef tagStack As System.Collections.Stack, ByRef valueStack As System.Collections.Stack) As Boolean Implements Processor.text
        If tagStack.Count = 0 OrElse Not TagPattern.IsMatch(CStr(tagStack.Peek())) Then
            Return False
        End If
        valueStack.Push(value)
        Return True
    End Function

    ' Concatenates the formatted value of every wrapper, in list order, and
    ' writes the result to the output file.
    Private Sub flush()
        Dim record As New StringBuilder()
        For Each wrapper As TagWrapper In tagWrapperList
            record.Append(wrapper.getFormatValue())
        Next
        sw.Write(record.ToString())
        sw.Flush()
    End Sub

    ' Stores the value in the first wrapper that accepts the tag name; does
    ' nothing when no wrapper accepts it.
    Private Sub execute(ByVal tagName As String, ByVal value As String)
        For Each wrapper As TagWrapper In tagWrapperList
            If wrapper.accept(tagName) Then
                wrapper.Value = value
                Exit Sub
            End If
        Next
    End Sub

End Class
 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章