实验目的与内容

设计、编制并调试一个简单语言SysY的词法分析程序,加深对词法分析原理的理解。

image-20210816212204782

程序总体设计思路和框架

image-20210816213013296

主要的数据结构和流程描述

image-20210816213140112

源代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import re
# Reserved words: a scanned word found in this list is emitted as a
# keyword token instead of an identifier token.
keyword = (
    'auto break case char const continue default do double else enum '
    'extern float for goto if int long register return short signed '
    'sizeof static struct switch typedef union unsigned void volatile while'
).split()
# Operator table. The position of an entry is the numeric code printed in
# its token, so entries must stay unique and keep their order.
operator = [
    # arithmetic operators (the second '+=' of the original was a typo for
    # '*=', which made '*=' unrecognisable as a single operator)
    '+', '-', '*', '/', '%', '++', '--', '+=', '-=', '*=', '/=',
    # relational operators
    '==', '!=', '>', '<', '>=', '<=',
    # bitwise operators
    '&', '|', '^', '~', '<<', '>>',
    # logical operators
    '&&', '||', '!',
    # assignment operator
    '=',
]
# Single-character delimiters; the position of a character in this list is
# the numeric code printed in its token.
delimiters = list('{}[]().,:;')
def isIdentifier(str):
    """Return True if ``str`` is a valid identifier.

    A valid identifier starts with an ASCII letter or an underscore and
    continues with ASCII letters, digits or underscores. The empty string
    and any text containing non-ASCII characters (for example Chinese
    characters) are rejected.

    Args:
        str: The candidate text. The parameter name shadows the builtin
            but is kept so existing keyword-argument callers still work.

    Returns:
        bool: True when the whole text is an identifier, False otherwise.
    """
    # An explicit ASCII character class keeps the first-character rule and
    # the tail rule consistent, and accepts names such as 'a_' or '__'
    # whose tail is made only of underscores.
    return re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', str) is not None
# str.isdigit()
# str.isalpha()
# 处理注释'//''/*''*/'
def isKeywords(str):
    """Return True when ``str`` is one of the entries of ``keyword``."""
    return str in keyword

# Matches one /* ... */ block comment (non-greedy, so two separate comments
# never swallow the code between them) or one // line comment.
_COMMENT_RE = re.compile(r'/\*[\s\S]*?\*/|//.*')

_DECIMAL_DIGITS = '0123456789'

# Letter following a leading '0' -> (numeric base, characters allowed).
_NUMBER_PREFIXES = {
    'x': (16, '0123456789abcdefABCDEF'),
    'o': (8, '01234567'),
    'b': (2, '01'),
}


def strip_comments(text):
    """Return ``text`` with every comment replaced by blank space.

    Each comment becomes a single space followed by as many newlines as the
    comment contained, so tokens on either side of a comment stay separate
    and every remaining line keeps its original line number.
    """
    def _blank(match):
        return ' ' + '\n' * match.group(0).count('\n')

    return _COMMENT_RE.sub(_blank, text)


def scan_number(line, start):
    """Scan the integer literal that begins at ``line[start]``.

    ``line[start]`` must be a decimal digit. Recognised forms are
    ``0x``/``0o``/``0b`` prefixed literals (prefix letter in either case),
    C-style octal written with a bare leading ``0``, and plain decimal.

    Returns:
        tuple: ``(value, end)`` where ``end`` is the index just past the
        scanned text and ``value`` is the integer, or ``None`` when the
        literal is malformed (a prefix without digits, or a digit that is
        not valid in a leading-zero octal literal).
    """
    marker = line[start + 1:start + 2].lower()
    if line[start] == '0' and marker in _NUMBER_PREFIXES:
        base, allowed = _NUMBER_PREFIXES[marker]
        pos = start + 2
    elif line[start] == '0':
        # A bare leading zero is how SysY (like C) spells octal. Consume
        # every decimal digit so that '09' is reported as one bad literal
        # instead of being split into '0' and '9'.
        base, allowed = 8, _DECIMAL_DIGITS
        pos = start
    else:
        base, allowed = 10, _DECIMAL_DIGITS
        pos = start

    first_digit = pos
    while pos < len(line) and line[pos] in allowed:
        pos += 1
    try:
        return int(line[first_digit:pos], base), pos
    except ValueError:
        return None, pos


def tokenize(text):
    """Split source ``text`` into printable token strings.

    Uses the module-level ``delimiters``, ``operator`` and ``keyword``
    tables and ``isIdentifier``.

    Returns:
        tuple: ``(tokens, errors)`` - the token strings in source order and
        the error messages in the order the problems were found.
    """
    tokens = []
    errors = []
    symbols = {}  # identifier name -> index in order of first appearance

    for lineno, line in enumerate(strip_comments(text).split('\n'), 1):
        i = 0
        size = len(line)
        while i < size:
            ch = line[i]
            if ch.isspace():
                # Spaces and tabs separate tokens; they are never deleted,
                # which would glue neighbouring words together.
                i += 1
            elif ch in delimiters:
                tokens.append(f"<delimiter,{delimiters.index(ch)},'{ch}'>")
                i += 1
            elif ch in operator:
                # Prefer the two-character operator when one starts here.
                pair = line[i:i + 2]
                lexeme = pair if pair in operator else ch
                tokens.append(f"<operator,{operator.index(lexeme)},'{lexeme}'>")
                i += len(lexeme)
            elif ch in _DECIMAL_DIGITS:
                value, i = scan_number(line, i)
                if value is None:
                    errors.append(f'Error at line {lineno}:非法的数字常量')
                else:
                    tokens.append(f'<num,{value}>')
            elif isIdentifier(ch):
                # After the first character, digits are allowed as well.
                end = i + 1
                while end < size and (line[end] == '_'
                                      or line[end].encode().isalnum()):
                    end += 1
                word = line[i:end]
                if word in keyword:
                    tokens.append(f'<{word}>')
                else:
                    index = symbols.setdefault(word, len(symbols))
                    tokens.append(f"<identifier,{index},'{word}'>")
                i = end
            else:
                errors.append(f'Error at line {lineno}:不允许的字符')
                i += 1
    return tokens, errors


def main():
    """Tokenize a source file and print its errors, then its tokens.

    The file path is taken from the first command-line argument; without
    one, the original hard-coded test file is used.
    """
    import sys

    filename = sys.argv[1] if len(sys.argv) > 1 else r'test\04_const_defn.sy'
    with open(filename, encoding='utf-8', mode='r') as fp:
        text = fp.read()

    tokens, errors = tokenize(text)
    for message in errors:
        print(message)
    for tok in tokens:
        print(tok)


if __name__ == '__main__':
    main()

另外附一个验收时候的视频:https://www.bilibili.com/video/BV1mL411t7kQ/