#! /usr/bin/env python
# -*- coding: utf-8 -*-
####################################################################
# Program performing the automatic transcription of a text
#   from the conservative spelling to the alternative spelling.
#
# Mario Periard     2007-06-08
#
####################################################################

import sys, os, string, re, locale

# NOTE(review): the value returned by this call is discarded, so it has no
# visible effect in this script; kept unchanged.
locale.getlocale()
# All character codes below are ISO-8859-1 (Latin-1) byte values.  A Latin-1
# encoding marker used to sit here, but Python only honours such a marker on
# the first or second line of a file, so it never had any effect.

# Capitalisation flags for the previous / current word of the main loop.
moPresedanEmaj = False
moKouranEmaj = False

# Codes regarded as "letters" when the text is cut into words:
# A-Z, a-z, then the Latin-1 accented range 192-253 without 221.
# The original table skipped 121 (lowercase 'y') although 89 ('Y') was
# present, which cut every word containing a 'y' in two; 121 is restored.
# NOTE(review): 215 and 247 (multiplication / division signs) are still
# counted as letters and 221, 254, 255 are not - left exactly as found.
ordLetre = (list(range(65, 91)) + list(range(97, 123)) +
            list(range(192, 221)) + list(range(222, 254)))

# Vowels (lower case, upper case, accented) plus H and h.
ordVouayel = [97, 101, 105, 111, 117, 121,
              65, 69, 73, 79, 85, 89,
              224, 226, 228, 232, 233, 234, 235, 236,
              238, 239, 242, 244, 246, 249, 251, 252,
              192, 194, 196, 200, 201, 202, 203, 204,
              206, 207, 210, 212, 214, 217, 219, 220,
              72, 104]

# Parallel tables: ordAksanMaj[i] is the capital of ordAksanMin[i].
ordAksanMin = [224, 226, 228, 232, 233, 234, 235, 236,
               238, 239, 242, 244, 246, 249, 251, 252, 231]
ordAksanMaj = [192, 194, 196, 200, 201, 202, 203, 204,
               206, 207, 210, 212, 214, 217, 219, 220, 199]

# Two-letter endings tested on the transcribed previous word before its
# final consonant is replaced by the liaison consonant.
nazal = ['an', 'in', 'on']

# Final letters that can trigger a liaison.
terminezon = ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'm', 'n', 'p', 'q', 'r',
              's', 't', 'v', 'x', 'z']

# Previous words that always allow a liaison (category 'SP1').
moLiezonAvan = ['en', 'sous', 'dans', 'sans', 'dont', 'plusieurs', 'leurs',
                'divers', 'plus', 'tous', 'tout']

# Current words that always allow a liaison (category 'SP2').
moLiezonApre = ['a', 'en', 'i', 'o', 'u', 'y']

# Forms of the verb "etre".  The accented letters were written with CP850
# byte values (0x82, 0x88, 0x96) while every other table in this file uses
# Latin-1, so these forms could never match a word of the text.  They are
# now Latin-1: 0xE9 e-acute, 0xEA e-circumflex, 0xFB u-circumflex.
verbEtre = ['suis', 'es', 'est', 'sommes', '\xeates', 'sont',
            '\xe9tais', '\xe9tait', '\xe9tions', '\xe9tiez', '\xe9taient',
            'fus', 'fut', 'f\xfbmes', 'f\xfbtes', 'furent',
            'serai', 'seras', 'sera', 'serons', 'serez', 'seront',
            'sois', 'soit', 'soyons', 'soyez', 'soient',
            'f\xfbt', 'fussions', 'fussiez', 'fussent',
            'serais', 'serait', 'serions', 'seriez', 'seraient',
            '\xe9tant']

# Previous words ending in 'n' that keep their last letter in a liaison.
eksepFinalN = ['on', 'mon', 'ton', 'son', 'en', 'un']

####################################################################
# Opening of the files
####################################################################

# Text to transcribe (read only).
fichie_sours = open("tesours.txt", "r")
# Word list read line by line to build the aspirated-H set.
fichie_h = open("h_aspire.txt", "r")
# Result of the transcription; any existing content is overwritten.
fichie_sibl = open("tesibl.txt", "w")
# Tab-separated dictionary, one entry per line.
fichie_diko = open("diko-l.txt", "r")


####################################################################
# Construction of the set of words starting with an aspirated H
####################################################################

# The file holds one word per line; it is cut on every CR or LF.  Empty
# strings produced by blank lines or CR+LF pairs end up in the set too.
moxHaspire = set(re.split('[\n\r]', fichie_h.read()))

fichie_h.close()


####################################################################
# Inventory of the words of the source text
####################################################################

kontenu_lu = fichie_sours.read()
fichie_sours.close()

# Words are extracted with the same definition of "letter" (ordLetre) that
# the replacement loop uses.  The previous version split on a fixed list of
# punctuation marks, so a word touching any other character (parenthesis,
# digit, guillemet...) was stored with that character attached, never
# matched a dictionary entry and was silently left untranscribed.
klas_letre = ''.join([re.escape(chr(kod)) for kod in ordLetre])

ansanble_mo_teqst = set()
for elem in re.findall('[' + klas_letre + ']+', kontenu_lu):
    kod_debu = ord(elem[0])
    if kod_debu in ordAksanMaj:
        # str.lower() is not relied on for accented capitals: the initial is
        # mapped through the parallel ordAksanMaj / ordAksanMin tables and
        # the rest of the word is kept as it is.
        elem = chr(ordAksanMin[ordAksanMaj.index(kod_debu)]) + elem[1:]
    else:
        elem = elem.lower()
    ansanble_mo_teqst.add(elem)


####################################################################
# Lookup of the alternative spelling
####################################################################

# diko_tmp maps each dictionary word that occurs in the source text to
# [alternative spelling, category, number].
diko_tmp = {}

for lign_diko in fichie_diko:
    # The line terminator is removed so that the last field does not carry
    # a trailing newline.
    chanx = lign_diko.rstrip('\r\n').split("\t")

    # Blank or malformed lines (fewer than four tab-separated fields, or an
    # empty first field) used to raise IndexError; they are skipped.
    if len(chanx) < 4 or chanx[0] == "":
        continue

    ortho, orto, kateg, nonbr = chanx[:4]

    # Normalise the key the same way the words of the text are normalised.
    if ortho.istitle():
        ortho = ortho.lower()
    elif ord(ortho[0]) in ordAksanMaj:
        ran = ordAksanMaj.index(ord(ortho[0]))
        ortho = chr(ordAksanMin[ran]) + ortho[1:]

    # Only entries actually needed for this text are kept; when a key occurs
    # several times the last line wins.
    if ortho in ansanble_mo_teqst:
        diko_tmp[ortho] = [orto, kateg, nonbr]
		

####################################################################
# Replacement of the words
####################################################################
#
# The text is read as a sequence of (word, separator) pairs, a word being a
# run of characters whose code is in ordLetre.  Each word is written one
# step late, once the following word is known, because a liaison between
# the two may change the ending of the first one.
#
# Fixes compared with the previous version of this section:
#   * an empty text, or a text starting with a non-letter, no longer raises
#     IndexError;
#   * the last character of the text is no longer dropped;
#   * the last word gets its initial capital back like every other word;
#   * the output file is closed even when an error occurs.

# Set copy of ordLetre for constant-time membership tests.
letrx = frozenset(ordLetre)

# Liaison consonant for final letters that change sound; any other final
# letter is used unchanged.
liezon_par_final = {'s': 'z', 'x': 'z', 'd': 't', 'f': 'v'}


def _an_minuskul(mo):
    """Return (mo in dictionary-key case, True if mo began with a capital).

    An accented capital initial is mapped through ordAksanMaj/ordAksanMin
    and the rest of the word is left untouched; a word starting with A-Z is
    lower-cased with str.lower(); any other word is returned unchanged.
    """
    kod = ord(mo[0])
    if kod in ordAksanMaj:
        return chr(ordAksanMin[ordAksanMaj.index(kod)]) + mo[1:], True
    if 65 <= kod <= 90:
        return mo.lower(), True
    return mo, False


def _an_majuskul(mo):
    """Return mo with its initial letter turned back into a capital."""
    if mo == "":
        return mo
    kod = ord(mo[0])
    if kod in ordAksanMin:
        return chr(ordAksanMaj[ordAksanMin.index(kod)]) + mo[1:]
    if 97 <= kod <= 122:
        return mo.capitalize()
    return mo


def _ekri(mo, separater):
    """Write one transcribed word followed by its separator.

    A final 'x' is not written.
    NOTE(review): presumably a silent marker of the alternative spelling;
    it is also removed from words copied through unchanged - confirm.
    """
    if mo != "" and mo[-1] == 'x':
        mo = mo[:-1]
    fichie_sibl.write(mo)
    fichie_sibl.write(separater)


# State of the previous word (empty before the first word is read).
moPresedan = ""
moPresedanO = ""
moPresedanEmaj = False
sepPresedan = ""
katPresedan = "???"
nbrPresedan = "s"

konter = 0
longer = len(kontenu_lu)

try:
    while konter < longer:

        # As long as it is a letter...
        debu = konter
        while konter < longer and ord(kontenu_lu[konter]) in letrx:
            konter = konter + 1
        moKouran = kontenu_lu[debu:konter]

        # As long as it is not a letter...
        debu = konter
        while konter < longer and ord(kontenu_lu[konter]) not in letrx:
            konter = konter + 1
        sepKouran = kontenu_lu[debu:konter]

        if moKouran == "":
            # Only possible for the first pair, when the text starts with a
            # separator: there is no word to look up.
            moKouranEmaj = False
            moKouranO = ""
            katKouran = "???"
            nbrKouran = "s"
        else:
            moKouran, moKouranEmaj = _an_minuskul(moKouran)
            if moKouran in diko_tmp:
                moKouranO, kateg, nbrKouran = diko_tmp[moKouran]
                katKouran = kateg[0:3]
            else:
                # Unknown word: copied through unchanged.
                moKouranO = moKouran
                katKouran = "???"
                nbrKouran = "s"

        # A liaison is only considered between two words separated by
        # exactly one space or one hyphen.
        if moPresedan != "" and moKouran != "" and sepPresedan in (' ', '-'):

            # Categorisation of the determiners
            if moPresedan in moLiezonAvan:
                katPresedan = 'SP1'
            if moKouran in moLiezonApre:
                katKouran = 'SP2'
            if katKouran in ('ADJ', 'ART'):
                katKouran = 'DET'
            if katPresedan in ('ADJ', 'ART'):
                katPresedan = 'DET'

            # Treatment of the liaisons
            finMoPres = moPresedan[-1]
            debMoKour = moKouran[0]
            liezon = liezon_par_final.get(finMoPres, finMoPres)

            if ((finMoPres in terminezon)
                    and (ord(debMoKour) in ordVouayel)
                    and (moKouran not in moxHaspire)):
                liezon_permiz = (
                    katPresedan in ('SP1', 'DET')
                    or katKouran == 'SP2'
                    or (katPresedan == 'PRO' and katKouran == 'VER')
                    or (katPresedan == 'VER' and katKouran == 'PRO')
                    or moPresedan in verbEtre
                )
                if liezon_permiz:
                    if (moPresedanO[-1:] in terminezon
                            and moPresedanO[-2:] not in nazal
                            and moPresedan not in eksepFinalN):
                        # The final consonant is replaced by the liaison.
                        moPresedanO = moPresedanO[:-1] + '-' + liezon
                    else:
                        # The liaison is appended to the whole word.
                        moPresedanO = moPresedanO + '-' + liezon

        if moPresedanEmaj:
            moPresedanO = _an_majuskul(moPresedanO)
        _ekri(moPresedanO, sepPresedan)

        # The current word becomes the previous one.  katKouran is carried
        # over with the changes made to it above.
        moPresedan = moKouran
        moPresedanEmaj = moKouranEmaj
        moPresedanO = moKouranO
        sepPresedan = sepKouran
        katPresedan = katKouran
        nbrPresedan = nbrKouran

    # The last word is still pending; it cannot take a liaison.
    if moPresedanEmaj:
        moPresedanO = _an_majuskul(moPresedanO)
    _ekri(moPresedanO, sepPresedan)

finally:
    fichie_sours.close()
    fichie_sibl.close()
    fichie_diko.close()

