from conllu import parse_incr
# Count tables filled from the training corpus below. They hold raw counts
# only; presumably they are normalized into probabilities elsewhere
# (TODO confirm against the rest of the file).
#   uniqueFeatureDict: "tag|word,prevtag" -> occurrences, i.e. c(tag|word,prevtag)
#   contextDict:       "word,prevtag"     -> occurrences, i.e. c(word,prevtag)
uniqueFeatureDict = {}
contextDict = {}
# Name of the token field that is read as the tag.
tagtype = 'upos'

# Computing counts (pre-probabilities).
# The context manager guarantees the corpus file is closed once it has been
# consumed, even if parsing raises part-way through.
with open("UD_Spanish-AnCora/es_ancora-ud-train.conllu", "r", encoding="utf-8") as data_file:
    for tokenlist in parse_incr(data_file):
        # Every token list starts over with the placeholder previous tag
        # "None" (a string, so it can be concatenated into the keys).
        prevtag = "None"
        for token in tokenlist:
            tag = token[tagtype]
            # Word forms are lower-cased before being used in the keys.
            word = token['form'].lower()

            # c(tag|word,prevtag)
            largeKey = tag + '|' + word + ',' + prevtag
            uniqueFeatureDict[largeKey] = uniqueFeatureDict.get(largeKey, 0) + 1

            # c(word,prevtag)
            key = word + ',' + prevtag
            contextDict[key] = contextDict.get(key, 0) + 1

            # The current tag becomes the context of the next token.
            prevtag = tag
¿Quieres ver más aportes, preguntas y respuestas de la comunidad?