自制編譯器：詞法分析器

詞法分析器代碼已上傳到個人資源中。

當我們的程序源文件進入編譯器，首先遇到的就是詞法分析器。

詞法分析器的作用就是解析源文件，分析出其中的詞素，並把這個詞素的順序集輸入給語法分析器。

接上篇把所謂的詞素也就是終結符號列出來：

if else while ( ) { } cpreop bitop logiop armtcop number literal id NUL new [ ] basetype class private public static return break continue . this

其中cpreop包括 > < >= <= == != 即比較運算符

bitop 爲位運算符，包括<< >> & | ^

logiop 邏輯運算符包括 && ||

armtcop 算數運算符包括 + - * /

number 數字常量，例如 12345（整型）或 1.2345（小數）

id 標識符按java規則

literal 字符串常量如"ROgerwong"

NUL 空串

basetype 基本類型包括 int char double 三種

當然，爲了簡單，在這裏並不打算討論非確定有窮自動機和確定有窮自動機的理論以及其之間的轉換算法，只是用最樸素的方法，不斷的將字符讀入緩衝區，然後和這些詞素進行比較，然後把這個詞素加入到一個ArrayList中。

按着這個方法定義幾個數據結構：

定義詞素數據結構，共含兩個域，1個表示類型，一個表示具體的值，類型的取值也已經標出。

package ravaComplier.lexer;

/**
 * A single lexeme (terminal symbol) produced by the lexer: a numeric type
 * code paired with the concrete value that was read from the source text.
 */
public class Lexeme {
	/** Type code of this lexeme; one of the constants declared below. */
	public int type;
	/** Concrete value of this lexeme (the lexer stores the matched text). */
	public Object value;

	/**
	 * Creates a lexeme.
	 *
	 * @param t type code, one of the constants declared in this class
	 * @param v concrete value of the lexeme
	 */
	public Lexeme(int t, Object v) {
		type = t;
		value = v;
	}

	/**
	 * Renders the lexeme as {@code <type:value>}.
	 * A {@code null} value is rendered as the text {@code null} instead of
	 * raising a NullPointerException.
	 */
	@Override
	public String toString() {
		return "<" + type + ":" + value + ">";
	}

	public static final int IF = 0;          // if
	public static final int ELSE = 1;        // else
	public static final int WHILE = 2;       // while
	public static final int BRACKET = 3;     // any kind of bracket
	public static final int CPREOP = 4;      // comparison operator
	public static final int BITOP = 5;       // bitwise operator
	public static final int LOGIOP = 6;      // logical operator
	public static final int ARMTOP = 7;      // arithmetic operator
	public static final int NUMBER = 8;      // numeric constant
	public static final int LITERAL = 9;     // string literal
	public static final int ID = 10;         // identifier
	public static final int NUL = 11;        // empty string
	public static final int NEW = 12;        // new operator
	public static final int BASETYPE = 13;   // basic data type
	public static final int CLASS = 14;      // keyword class
	public static final int ACCESSFLAG = 15; // public or private
	public static final int STATIC = 16;     // keyword static
	public static final int RETURN = 17;     // keyword return
	public static final int BREAK = 18;      // break
	public static final int CONTINUE = 19;   // continue
	public static final int DOT = 20;        // .
	public static final int THIS = 21;       // keyword this
	public static final int SEMI = 22;       // semicolon
	public static final int EQUAL = 23;      // equals sign (assignment)
}

其次，因爲是用樸素的笨辦法，所以我們需要構造規則：

定義分隔符：空格、製表符、換行符、+、-、*、/、.、;、各種括號、運算符等。

若遇到分隔符，則分隔符前面的緩衝區內容爲一個詞素，分隔符本身也是一個詞素（空格、製表符、換行符除外）。

但注意特殊情況，若遇到>和>=，&和&& 之類的，需要多向前看一個字符來確定詞素。

然後再把分割出的詞素實例化成Lexeme類型，並加入到返回結果中。

代碼很簡單，但寫起來比較費事：

package ravaComplier.lexer;

import java.io.*;
import java.util.*;

/**
 * Hand-written lexer: reads the whole input and splits it into a list of
 * {@link Lexeme}s with a single left-to-right scan and one character of
 * lookahead.
 *
 * <p>Scanning rules:
 * <ul>
 *   <li>Whitespace (space, tab, CR, LF, ...) separates lexemes and is dropped.</li>
 *   <li>Two-character operators ({@code >= <= == != && || << >>}) are matched
 *       before one-character ones, so {@code >=} is never split into
 *       {@code >} and {@code =}.</li>
 *   <li>{@code "..."} on a single line is one string literal, even if it
 *       contains spaces or operator characters. Escape sequences are not
 *       interpreted.</li>
 *   <li>A {@code .} between digits belongs to a decimal number; any other
 *       {@code .} is a DOT lexeme.</li>
 *   <li>Everything else is collected into a word, which is classified as a
 *       keyword, a number, or (fallback) an identifier.</li>
 * </ul>
 *
 * <p>The class keeps no state between calls.
 */
public class Lexer {
	/** Characters that always form a lexeme of their own (when not part of a two-character operator). */
	private static final String SINGLE_CHAR_LEXEMES = ";{}()[]+-*/.<>=&|^";

	/** Lexemes with a fixed spelling (keywords, punctuation, operators) mapped to their Lexeme type code. */
	private static final Map<String, Integer> FIXED_LEXEMES = new HashMap<String, Integer>();

	static {
		register(Lexeme.IF, "if");
		register(Lexeme.ELSE, "else");
		register(Lexeme.WHILE, "while");
		register(Lexeme.BRACKET, "{", "}", "[", "]", "(", ")");
		register(Lexeme.CPREOP, ">", "<", "==", ">=", "<=", "!=");
		register(Lexeme.BITOP, "&", "|", "^", "<<", ">>");
		register(Lexeme.LOGIOP, "&&", "||");
		register(Lexeme.ARMTOP, "+", "-", "*", "/");
		register(Lexeme.NEW, "new");
		register(Lexeme.BASETYPE, "int", "char", "double");
		register(Lexeme.CLASS, "class");
		register(Lexeme.ACCESSFLAG, "private", "public");
		register(Lexeme.STATIC, "static");
		register(Lexeme.RETURN, "return");
		register(Lexeme.BREAK, "break");
		register(Lexeme.CONTINUE, "continue");
		register(Lexeme.DOT, ".");
		register(Lexeme.THIS, "this");
		register(Lexeme.SEMI, ";");
		register(Lexeme.EQUAL, "=");
	}

	/** Records every given spelling under the given Lexeme type code. */
	private static void register(int type, String... spellings) {
		for (String spelling : spellings) {
			FIXED_LEXEMES.put(spelling, Integer.valueOf(type));
		}
	}

	/**
	 * Splits the content of the stream into lexemes.
	 *
	 * <p>The stream is decoded with the platform default charset and is read
	 * to its end, but it is not closed; closing it is left to the caller.
	 *
	 * @param is stream holding the source text
	 * @return the lexemes in source order
	 * @throws NullPointerException if {@code is} is null
	 * @throws IllegalStateException if reading the stream fails; the cause is
	 *         the underlying IOException (a partial token list is never returned)
	 */
	public static ArrayList<Lexeme> getLexerOutput(InputStream is)
	{
		if (is == null) {
			throw new NullPointerException("is");
		}
		String src = readAll(is);
		ArrayList<Lexeme> tokens = new ArrayList<Lexeme>();
		int pos = 0;
		while (pos < src.length()) {
			char c = src.charAt(pos);
			if (Character.isWhitespace(c)) {
				pos++;
				continue;
			}
			int next;
			if (c == '"') {
				next = scanLiteral(src, pos);
				if (next < 0) {
					// No closing quote on this line: fall back to an ordinary word
					// so the fragment is still reported (as an identifier).
					next = scanWord(src, pos);
				}
			} else {
				int opLength = operatorLength(src, pos);
				next = opLength > 0 ? pos + opLength : scanWord(src, pos);
			}
			Lexeme lexeme = getLexeme(src.substring(pos, next));
			if (lexeme != null) {
				tokens.add(lexeme);
			}
			pos = next;
		}
		return tokens;
	}

	/** Reads the remaining content of the stream into a string. */
	private static String readAll(InputStream is)
	{
		BufferedReader reader = new BufferedReader(new InputStreamReader(is));
		StringBuilder text = new StringBuilder();
		char[] chunk = new char[4096];
		try {
			int count;
			while ((count = reader.read(chunk)) != -1) {
				text.append(chunk, 0, count);
			}
		} catch (IOException e) {
			// Lexing a truncated source would silently produce a wrong token stream.
			throw new IllegalStateException("failed to read lexer input", e);
		}
		return text.toString();
	}

	/**
	 * Returns the length (1 or 2) of the operator/punctuation lexeme starting
	 * at {@code pos}, or 0 if none starts there. A lone {@code !} is not an
	 * operator; it only counts as part of {@code !=}.
	 */
	private static int operatorLength(String src, int pos)
	{
		char first = src.charAt(pos);
		if (pos + 1 < src.length() && isTwoCharOperator(first, src.charAt(pos + 1))) {
			return 2;
		}
		return SINGLE_CHAR_LEXEMES.indexOf(first) >= 0 ? 1 : 0;
	}

	/** Tells whether the two characters spell one of {@code >= >> <= << == != && ||}. */
	private static boolean isTwoCharOperator(char first, char second)
	{
		switch (first) {
		case '>':
			return second == '=' || second == '>';
		case '<':
			return second == '=' || second == '<';
		case '=':
		case '!':
			return second == '=';
		case '&':
			return second == '&';
		case '|':
			return second == '|';
		default:
			return false;
		}
	}

	/**
	 * Scans a string literal whose opening quote is at {@code start}.
	 *
	 * @return the index just past the closing quote, or -1 if the line ends
	 *         (or the input ends) before a closing quote is found
	 */
	private static int scanLiteral(String src, int start)
	{
		for (int i = start + 1; i < src.length(); i++) {
			char c = src.charAt(i);
			if (c == '"') {
				return i + 1;
			}
			if (c == '\n' || c == '\r') {
				break;
			}
		}
		return -1;
	}

	/**
	 * Scans a word (keyword, identifier or number) starting at {@code start}.
	 * The character at {@code start} is always consumed, so the scan makes
	 * progress on any input.
	 *
	 * @return the index just past the last character of the word
	 */
	private static int scanWord(String src, int start)
	{
		// True while the word consists of digits only, i.e. a '.' may still be a decimal point.
		boolean digitsOnly = isDigit(src.charAt(start));
		int i = start + 1;
		while (i < src.length()) {
			char c = src.charAt(i);
			if (c == '.') {
				boolean decimalPoint = digitsOnly && i + 1 < src.length() && isDigit(src.charAt(i + 1));
				if (!decimalPoint) {
					break;
				}
				digitsOnly = false; // at most one decimal point per number
				i++;
				continue;
			}
			if (Character.isWhitespace(c) || c == '"' || operatorLength(src, i) > 0) {
				break;
			}
			if (!isDigit(c)) {
				digitsOnly = false;
			}
			i++;
		}
		return i;
	}

	/**
	 * Builds the lexeme for one piece of source text.
	 *
	 * @param lex the text of the lexeme
	 * @return the classified lexeme, or null if {@code lex} is null or empty
	 */
	private static Lexeme getLexeme(String lex)
	{
		if (lex == null || lex.length() == 0) {
			return null;
		}
		Integer fixedType = FIXED_LEXEMES.get(lex);
		if (fixedType != null) {
			return new Lexeme(fixedType.intValue(), lex);
		}
		if (isNumber(lex)) {
			return new Lexeme(Lexeme.NUMBER, lex);
		}
		if (isStr(lex)) {
			return new Lexeme(Lexeme.LITERAL, lex);
		}
		// Anything that is not recognised above is reported as an identifier.
		return new Lexeme(Lexeme.ID, lex);
	}

	/**
	 * Tells whether the text is a string literal: at least two characters,
	 * enclosed in double quotes, with no further double quote in between.
	 */
	private static boolean isStr(String lex)
	{
		int last = lex.length() - 1;
		return last >= 1 && lex.charAt(0) == '"' && lex.indexOf('"', 1) == last;
	}

	/**
	 * Tells whether the text is a numeric constant: one or more ASCII digits,
	 * optionally followed by a '.' and one or more digits. Spellings such as
	 * {@code NaN}, {@code Infinity} or {@code 1d} are not numbers.
	 */
	private static boolean isNumber(String str)
	{
		int length = str.length();
		int i = 0;
		while (i < length && isDigit(str.charAt(i))) {
			i++;
		}
		if (i == 0) {
			return false;
		}
		if (i == length) {
			return true;
		}
		if (str.charAt(i) != '.') {
			return false;
		}
		int fractionStart = ++i;
		while (i < length && isDigit(str.charAt(i))) {
			i++;
		}
		return i == length && i > fractionStart;
	}

	/** Tells whether the character is an ASCII digit. */
	private static boolean isDigit(char c)
	{
		return c >= '0' && c <= '9';
	}
}

然後自己寫一段程序，試一試能不能正確的解析：

class testclass{
   private static int j=0;
   public int i=1;
   public testclass()
  {
     double c=1;
     char[] d="123456";
      
  }
  private static double func1()
  {
     if(j==0)
     {
     return 1.5
     }
     else
     {
       while(i<=10)
       {
         i=i+1;
       }
       return i;
     }  
   }
}

然後看一看輸出的結果：

<14:class>
<10:testclass>
<3:{>
<15:private>
<16:static>
<13:int>
<10:j>
<23:=>
<8:0>
<22:;>
<15:public>
<13:int>
<10:i>
<23:=>
<8:1>
<22:;>
<15:public>
<10:testclass>
<3:(>
<3:)>
<3:{>
<13:double>
<10:c>
<23:=>
<8:1>
<22:;>
<13:char>
<3:[>
<3:]>
<10:d>
<23:=>
<9:"123456">
<22:;>
<3:}>
<15:private>
<16:static>
<13:double>
<10:func1>
<3:(>
<3:)>
<3:{>
<0:if>
<3:(>
<10:j>
<4:==>
<8:0>
<3:)>
<3:{>
<17:return>
<8:1.5>
<3:}>
<1:else>
<3:{>
<2:while>
<3:(>
<10:i>
<4:<=>
<8:10>
<3:)>
<3:{>
<10:i>
<23:=>
<10:i>
<7:+>
<8:1>
<22:;>
<3:}>
<17:return>
<10:i>
<22:;>
<3:}>
<3:}>
<3:}>

貌似比較正確