Repository URL to install this package:
Version:
3.0.0 ▾
|
{ Parser of the CLDR collation xml files.
Copyright (c) 2013 by Inoussa OUEDRAOGO
The source code is distributed under the Library GNU
General Public License with the following modification:
- object files and libraries linked into an application may be
distributed without source code.
If you didn't receive a copy of the file COPYING, contact:
Free Software Foundation
675 Mass Ave
Cambridge, MA 02139
USA
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
}
unit cldrxml;
{$mode objfpc}{$H+}
{$TypedAddress on}
interface
uses
Classes, SysUtils, DOM,
cldrhelper;
procedure ParseInitialDocument(ASequence : POrderedCharacters; ADoc : TDOMDocument);overload;
procedure ParseInitialDocument(ASequence : POrderedCharacters; AFileName : string);overload;
procedure ParseCollationDocument(
ADoc : TDOMDocument;
ACollation : TCldrCollation;
AMode : TCldrParserMode
);overload;
procedure ParseCollationDocument(
const AFileName : string;
ACollation : TCldrCollation;
AMode : TCldrParserMode
);overload;
procedure ParseCollationDocument(
const AFileName : string;
ACollation : TCldrCollationItem;
AType : string
);overload;
procedure ParseCollationDocument(
ADoc : TDOMDocument;
ACollation : TCldrCollationItem;
AType : string
);overload;
resourcestring
sCaseNothandled = 'This case is not handled : "%s", Position = %d.';
sCodePointExpected = 'Code Point node expected as child at this position "%d".';
sCollationsNodeNotFound = '"collations" node not found.';
sCollationTypeNotFound = 'collation "Type" not found : "%s".';
sHexAttributeExpected = '"hex" attribute expected at this position "%d".';
sInvalidResetClause = 'Invalid "Reset" clause.';
sNodeNameAssertMessage = 'Expected NodeName "%s", got "%s".';
sRulesNodeNotFound = '"rules" node not found.';
sTextNodeChildExpected = '(Child) text node expected at this position "%d", but got "%s".';
sUniqueChildNodeExpected = 'Unique child node expected at this position "%d".';
sUnknownResetLogicalPosition = 'Unknown reset logical position : "%s".';
implementation
uses
typinfo, XMLRead, XPath, Helper, unicodeset;
const
s_AT = 'at';
//s_BEFORE = 'before';
s_CODEPOINT = 'codepoint';
s_COLLATION = 'collation';
s_COLLATIONS = 'collations';
s_CONTEXT = 'context';
//s_DEFAULT = 'default';
s_EXTEND = 'extend';
s_HEX = 'hex';
s_POSITION = 'position';
s_RESET = 'reset';
s_RULES = 'rules';
//s_STANDART = 'standard';
s_TYPE = 'type';
procedure CheckNodeName(ANode : TDOMNode; const AExpectedName : DOMString);
begin
if (ANode.NodeName <> AExpectedName) then
raise Exception.CreateFmt(sNodeNameAssertMessage,[AExpectedName,ANode.NodeName]);
end;
function CharToReorderWeigthKind(const AChar : Char) : TReorderWeigthKind;inline;
begin
case AChar of
'p' : Result := TReorderWeigthKind.PriMary;
's' : Result := TReorderWeigthKind.Secondary;
't' : Result := TReorderWeigthKind.Tertiary;
'i' : Result := TReorderWeigthKind.Identity;
else
Result := TReorderWeigthKind.Identity;
end;
end;
function DomString2UnicodeCodePointArray(const AValue : DOMString): TUnicodeCodePointArray;
var
u4str : UCS4String;
k : Integer;
begin
if (Length(AValue) = 0) then
exit(nil);
if (Length(AValue) = 1) then begin
SetLength(Result,1);
Result[0] := Ord(AValue[1])
end else begin
u4str := WideStringToUCS4String(AValue);
k := Length(u4str) - 1; // remove the last #0
SetLength(Result,k);
for k := 0 to k - 1 do
Result[k] := u4str[k];
end;
end;
function TryStrToLogicalReorder(
const AValue : string;
out AResult : TReorderLogicalReset
) : Boolean;
var
s : string;
i : Integer;
begin
s := StringReplace(AValue,' ','',[rfReplaceAll]);
s := StringReplace(s,'_','',[rfReplaceAll]);
i := GetEnumValue(TypeInfo(TReorderLogicalReset),s);
Result := (i > -1);
if Result then
AResult := TReorderLogicalReset(i);
end;
function ParseStatement(
ARules : TDOMElement;
AStartPosition : Integer;
AStatement : PReorderSequence;
var ANextPos : Integer
) : Boolean;
var
startPosition : Integer;
statement : PReorderSequence;
elementActualCount : Integer;
list : TDOMNodeList;
inBlock : Boolean;
procedure SkipComments();
begin
while (startPosition < list.Count) do begin
if (list[startPosition].NodeType <> COMMENT_NODE) then
Break;
Inc(startPosition);
end;
end;
function parse_reset() : Integer;
var
n, t : TDOMNode;
s : string;
logicalPos : TReorderLogicalReset;
begin
SkipComments();
n := list[startPosition];
CheckNodeName(n,s_RESET);
if n.HasChildNodes() then begin
n := n.FirstChild;
if (n.NodeType = TEXT_NODE) then begin
statement^.Reset := DomString2UnicodeCodePointArray(Trim(TDOMText(n).Data));
Result := startPosition+1;
end else begin
if not TryStrToLogicalReorder(n.NodeName,logicalPos) then
raise Exception.CreateFmt(sUnknownResetLogicalPosition,[n.NodeName]);
statement^.LogicalPosition := logicalPos;
Result := startPosition+1;
end;
end else if not n.HasChildNodes() then begin
if (list[startPosition+1].NodeName = s_POSITION) then begin
s := list[startPosition+1].Attributes.GetNamedItem(s_AT).NodeValue;
if not TryStrToLogicalReorder(s,logicalPos) then
raise Exception.CreateFmt(sUnknownResetLogicalPosition,[s]);
statement^.LogicalPosition := logicalPos;
Result := startPosition+2;
end else begin
t := list[startPosition+1];
{if (t.NodeType <> TEXT_NODE) then
raise Exception.CreateFmt(sTextNodeChildExpected,[(startPosition+1),(t.NodeName+'('+t.ClassName+')')]);}
if (t.NodeType = TEXT_NODE) then
statement^.Reset := DomString2UnicodeCodePointArray(Trim(TDOMText(t).Data))
else
statement^.Reset := DomString2UnicodeCodePointArray(' ');
Result := startPosition+2;
end;
end;
if (statement^.LogicalPosition = TReorderLogicalReset.None) and
(Length(statement^.Reset) = 0)
then
raise Exception.Create(sInvalidResetClause);
end;
procedure EnsureElementLength(const ALength : Integer);
var
k, d : Integer;
begin
k := Length(statement^.Elements);
if (k < ALength) then begin
k := ALength;
if (k = 0) then begin
k := 50;
end else begin
if (k < 10) then
d := 10
else
d := 2;
k := k * d;
end;
SetLength(statement^.Elements,k);
end;
end;
procedure AddElement(
const AChars : array of UCS4Char;
const AWeigthKind : TReorderWeigthKind;
const AContext : DOMString
);overload;
var
kp : PReorderUnit;
k : Integer;
begin
EnsureElementLength(elementActualCount+1);
kp := @statement^.Elements[elementActualCount];
SetLength(kp^.Characters,Length(AChars));
for k := 0 to Length(AChars) - 1 do
kp^.Characters[k] := AChars[k];
kp^.WeigthKind := AWeigthKind;
elementActualCount := elementActualCount + 1;
if (AContext <> '') then
kp^.Context := DomString2UnicodeCodePointArray(AContext);
end;
procedure ReadChars(
ANode : TDOMNode;
APos : Integer;
var AChars : UCS4String
);
var
t : TDOMNode;
u4str : UCS4String;
s : DOMString;
begin
if not ANode.HasChildNodes() then begin
SetLength(AChars,1);
AChars[0] := Ord(UnicodeChar(' '));
exit;
//raise Exception.CreateFmt(sCodePointExpected + ANode.ClassName,[APos]);
end;
t := ANode.FindNode(s_CODEPOINT);
if (t = nil) then begin
if (ANode.ChildNodes.Count <> 1) then
raise Exception.CreateFmt(sUniqueChildNodeExpected,[APos]);
t := ANode.ChildNodes[0];
if not t.InheritsFrom(TDOMText) then
raise Exception.CreateFmt(sTextNodeChildExpected,[APos,(t.NodeName+'('+t.ClassName+')')]);
s := TDOMText(t).Data;
if (Length(s) = 1) then begin
SetLength(AChars,1);
AChars[0] := Ord(s[1]);
end else begin
u4str := WideStringToUCS4String(s);
AChars := u4str;
SetLength(AChars,Length(AChars)-1);
end;
end else begin
t := t.Attributes.GetNamedItem(s_HEX);
if (t = nil) then
raise Exception.CreateFmt(sHexAttributeExpected,[APos]);
SetLength(AChars,1);
AChars[0] := StrToInt('$'+t.NodeValue);
end
end;
procedure AddPrefixChars(const APrefix : array of UCS4Char; var ADest : TUnicodeCodePointArray);
var
k : Integer;
begin
k := Length(ADest);
SetLength(ADest,(k+Length(APrefix)));
Move(ADest[0],ADest[k+1],(SizeOf(k*ADest[0])));
for k := 0 to k - 1 do
ADest[k] := APrefix[k];
end;
function ReadNextItem(const APos : Integer) : Integer;
var
n, t : TDOMNode;
contextStr : DOMString;
w : TReorderWeigthKind;
isSimpleCharTag : Boolean;
simpleCharTag : AnsiChar;
last : PReorderUnit;
u4str : UCS4String;
k : Integer;
begin
contextStr := '';
Result := APos;
n := list[APos];
isSimpleCharTag := (Length(n.NodeName) = 1) and (Ord(n.NodeName[1])<=127);
if isSimpleCharTag then begin
simpleCharTag := AnsiChar(n.NodeName[1]);
if (simpleCharTag = 'x') then begin
inBlock := True;
n := n.FirstChild;
if (n.NodeName = s_CONTEXT) then begin
if n.HasChildNodes() then begin
t := n.FirstChild;
if (t.NodeType = TEXT_NODE) then
contextStr := TDOMText(t).Data;
end;
n := n.NextSibling;
end;
isSimpleCharTag := (Length(n.NodeName) = 1) and (Ord(n.NodeName[1])<=127);
if isSimpleCharTag then
simpleCharTag := AnsiChar(n.NodeName[1]);
end;
end;
if isSimpleCharTag and (simpleCharTag in ['p','s','t','i']) then begin
w := CharToReorderWeigthKind(AnsiChar(n.NodeName[1]));
ReadChars(n,APos,u4str);
AddElement(u4str,w,contextStr);
Result := Result + 1;
if not inBlock then
exit;
last := @statement^.Elements[elementActualCount-1];
n := n.NextSibling;
if (n <> nil) and (n.NodeName = s_EXTEND) then begin
ReadChars(n,APos,u4str);
SetLength(last^.ExpansionChars,Length(u4str));
for k := 0 to Length(u4str) - 1 do
last^.ExpansionChars[k] := u4str[k];
end;
exit;
end;
if (Length(n.NodeName) = 2) and (n.NodeName[2] = 'c') and
(Ord(n.NodeName[1])<=127) and (AnsiChar(n.NodeName[1]) in ['p','s','t','i'])
then begin
w := CharToReorderWeigthKind(AnsiChar(n.NodeName[1]));
ReadChars(n,APos,u4str);
for k := Low(u4str) to High(u4str) do
AddElement(u4str[k],w,contextStr);
Result := Result + 1;
exit;
end;
raise Exception.CreateFmt(sCaseNothandled,[n.NodeName,APos]);
end;
var
i, c : Integer;
n : TDOMNode;
begin
Result := False;
inBlock := False;
elementActualCount := 0;
if (AStartPosition <= 0) then
startPosition := 0
else
startPosition := AStartPosition;
i := startPosition;
list := ARules.ChildNodes;
c := list.Count;
if (c <= i) then
exit;
statement := AStatement;
statement^.Clear();
n := list[i];
i := parse_reset();
while (i < c) do begin
n := list[i];
if (n.NodeName = s_RESET) then
Break;
i := ReadNextItem(i);
end;
SetLength(statement^.Elements,elementActualCount);
Result := (i > startPosition);
if Result then
ANextPos := i;
end;
procedure ParseInitialDocument(ASequence : POrderedCharacters; ADoc : TDOMDocument);
var
n : TDOMNode;
rulesElement : TDOMElement;
i, c, nextPost : Integer;
statement : TReorderSequence;
p : PReorderUnit;
begin
n := ADoc.DocumentElement.FindNode(s_RULES);
if (n = nil) then
raise Exception.Create(sRulesNodeNotFound);
rulesElement := n as TDOMElement;
c := rulesElement.ChildNodes.Count;
ASequence^.Clear();
SetLength(ASequence^.Data,c+100);
nextPost := 0;
i := 0;
while (i < c) do begin
statement.Clear();
if not ParseStatement(rulesElement,i,@statement,nextPost) then
Break;
i := nextPost;
try
ASequence^.ApplyStatement(@statement);
except
on e : Exception do begin
e.Message := Format('%s Position = %d',[e.Message,i]);
raise;
end;
end;
end;
if (ASequence^.ActualLength > 0) then begin
p := @ASequence^.Data[0];
for i := 0 to ASequence^.ActualLength - 1 do begin
p^.Changed := False;
Inc(p);
end;
end;
end;
procedure ParseInitialDocument(ASequence : POrderedCharacters; AFileName : string);
var
doc : TXMLDocument;
begin
ReadXMLFile(doc,AFileName);
try
ParseInitialDocument(ASequence,doc);
finally
doc.Free();
end;
end;
function EvaluateXPathStr(const AExpression : string; AContextNode : TDOMNode): DOMString;
var
xv : TXPathVariable;
begin
xv := EvaluateXPathExpression(AExpression,AContextNode);
try
if (xv <> nil) then
Result := xv.AsText
else
Result := '';
finally
xv.Free();
end;
end;
function ParseDeletion(
const APattern : DOMString;
ASequence : PReorderSequence
) : Integer;
var
r : array of TReorderUnit;
c, i : Integer;
uset : TUnicodeSet;
it : TUnicodeSet.TIterator;
p : PReorderUnit;
begin
if (APattern = '') then
exit(0);
it := nil;
uset := TUnicodeSet.Create();
try
uset.AddPattern(APattern);
it := uset.CreateIterator();
c := 0;
it.Reset();
while it.MoveNext() do begin
Inc(c);
end;
SetLength(r,c);
p := @r[0];
i := 0;
it.Reset();
while it.MoveNext() do begin
p^.Clear();
p^.WeigthKind := TReorderWeigthKind.Deletion;
p^.Characters := Copy(it.GetCurrent());
Inc(p);
Inc(i);
end;
ASequence^.Clear();
ASequence^.Elements := r;
finally
it.Free();
uset.Free();
end;
SetLength(r,0);
end;
procedure ParseCollationItem(
ACollationNode : TDOMElement;
AItem : TCldrCollationItem;
AMode : TCldrParserMode
);
var
n : TDOMNode;
rulesElement : TDOMElement;
i, c, nextPos : Integer;
statementList : TReorderSequenceArray;
sal : Integer;//statement actual length
statement : PReorderSequence;
s : DOMString;
begin
AItem.TypeName := ACollationNode.GetAttribute(s_TYPE);
AItem.Base := EvaluateXPathStr('base',ACollationNode);
AItem.Backwards := (EvaluateXPathStr('settings/@backwards',ACollationNode) = 'on');
if AItem.Backwards then
AItem.ChangedFields := AItem.ChangedFields + [TCollationField.BackWard];
AItem.Rules := nil;
if (AMode = TCldrParserMode.FullParsing) then begin
SetLength(statementList,15);
sal := 0;
statement := @statementList[0];
s := EvaluateXPathStr('suppress_contractions',ACollationNode);
if (s <> '') then begin
if (ParseDeletion(s,statement) > 0) then begin
Inc(sal);
Inc(statement);
end else begin
statement^.Clear();
end;
end;
n := ACollationNode.FindNode(s_RULES);
if (n <> nil) then begin
rulesElement := n as TDOMElement;
c := rulesElement.ChildNodes.Count;
nextPos := 0;
i := 0;
while (i < c) do begin
statement^.Clear();
if not ParseStatement(rulesElement,i,statement,nextPos) then
Break;
i := nextPos;
Inc(statement);
Inc(sal);
if (sal >= Length(statementList)) then begin
SetLength(statementList,(sal*2));
statement := @statementList[(sal-1)];
end;
end;
end;
SetLength(statementList,sal);
AItem.Rules := statementList;
end;
end;
procedure ParseCollationDocument(
ADoc : TDOMDocument;
ACollation : TCldrCollation;
AMode : TCldrParserMode
);
var
n : TDOMNode;
collationsElement : TDOMElement;
i, c : Integer;
item : TCldrCollationItem;
nl : TDOMNodeList;
begin
n := ADoc.DocumentElement.FindNode(s_COLLATIONS);
if (n = nil) then
raise Exception.Create(sCollationsNodeNotFound);
collationsElement := n as TDOMElement;
ACollation.Clear();
ACollation.Language := EvaluateXPathStr('identity/language/@type',ADoc.DocumentElement);
ACollation.Version := EvaluateXPathStr('identity/version/@number',ADoc.DocumentElement);
ACollation.DefaultType := EvaluateXPathStr('collations/default/@type',ADoc.DocumentElement);
if collationsElement.HasChildNodes() then begin
nl := collationsElement.ChildNodes;
c := nl.Count;
item := nil;
try
for i := 0 to c - 1 do begin
n := nl[i];
if (n.NodeName = s_COLLATION) then begin
item := TCldrCollationItem.Create();
ParseCollationItem((n as TDOMElement),item,AMode);
ACollation.Add(item);
item := nil;
end
end;
except
FreeAndNil(item);
raise;
end;
end;
end;
procedure ParseCollationDocument(
ADoc : TDOMDocument;
ACollation : TCldrCollationItem;
AType : string
);
var
xv : TXPathVariable;
begin
xv := EvaluateXPathExpression(Format('collations/collation[@type=%s]',[QuotedStr(AType)]),ADoc.DocumentElement);
try
if (xv.AsNodeSet.Count = 0) then
raise Exception.CreateFmt(sCollationTypeNotFound,[AType]);
ACollation.Clear();
ParseCollationItem((TDOMNode(xv.AsNodeSet[0]) as TDOMElement),ACollation,TCldrParserMode.FullParsing);
finally
xv.Free();
end
end;
function ReadXMLFile(f: TStream) : TXMLDocument;
var
src : TXMLInputSource;
parser: TDOMParser;
begin
src := TXMLInputSource.Create(f);
Result := TXMLDocument.Create;
parser := TDOMParser.Create();
try
parser.Options.IgnoreComments := True;
parser.Parse(src, Result);
finally
src.Free();
parser.Free;
end;
end;
function ReadXMLFile(const AFilename: String) : TXMLDocument;
var
FileStream: TStream;
begin
Result := nil;
FileStream := TFileStream.Create(AFilename, fmOpenRead+fmShareDenyWrite);
try
Result := ReadXMLFile(FileStream);
finally
FileStream.Free;
end;
end;
procedure ParseCollationDocument(
const AFileName : string;
ACollation : TCldrCollation;
AMode : TCldrParserMode
);
var
doc : TXMLDocument;
begin
doc := ReadXMLFile(AFileName);
try
ParseCollationDocument(doc,ACollation,AMode);
ACollation.LocalID := ExtractFileName(ChangeFileExt(AFileName,''));
finally
doc.Free();
end;
end;
procedure ParseCollationDocument(
const AFileName : string;
ACollation : TCldrCollationItem;
AType : string
);
var
doc : TXMLDocument;
begin
doc := ReadXMLFile(AFileName);
try
ParseCollationDocument(doc,ACollation,AType);
finally
doc.Free();
end;
end;
end.