Repository URL to install this package:
|
Version:
0.0.7 ▾
|
import csv
from io import StringIO
from pathlib import Path
from t2wml.utils.utilities import VALID_PROPERTY_TYPES
import t2wml.utils.t2wml_exceptions as T2WMLExceptions
from t2wml.wikification.utility_functions import get_property_type
def enclose_in_quotes(value):
    """Return ``value`` rendered as text and wrapped in double quotes.

    ``None`` and the empty string yield ``""`` (an empty, unquoted string).
    Any other value is converted with ``str()`` first; quote characters
    already present in the text are left as they are (no escaping is done).
    """
    if value is None or value == "":
        return ""
    return f'"{value!s}"'
def kgtk_add_property_type_specific_fields(property_dict, result_dict):
    """Add the type-dependent ``node2;kgtk:*`` entries to ``result_dict``.

    The type of ``property_dict["property"]`` is looked up with
    ``get_property_type`` and decides which keys are written.

    Args:
        property_dict: Statement or qualifier description. Must contain
            "property"; coordinates additionally need "latitude" and
            "longitude", every other type needs "value". "precision",
            "globe", "unit", "lower-bound", "upper-bound", "calendar" and
            "lang" are optional and default to "".
        result_dict: Edge row that is updated in place.

    Returns:
        None. The result is communicated through ``result_dict``.

    Raises:
        T2WMLExceptions.UnsupportedPropertyType: The looked-up type is not
            listed in ``VALID_PROPERTY_TYPES``.
        KeyError: A required key is missing from ``property_dict``.
    """
    prop = property_dict["property"]
    kind = get_property_type(prop)
    if kind not in VALID_PROPERTY_TYPES:
        message = "Property type " + kind + \
            " is not currently supported" + "(" + prop + ")"
        raise T2WMLExceptions.UnsupportedPropertyType(message)

    if kind == "globecoordinate":
        # The only property type that doesn't require "value".
        # node2;kgtk:latitude: for coordinates, the latitude
        # node2;kgtk:longitude: for coordinates, the longitude
        result_dict["node2;kgtk:data_type"] = "location_coordinates"
        result_dict["node2;kgtk:latitude"] = property_dict["latitude"]
        result_dict["node2;kgtk:longitude"] = property_dict["longitude"]
        result_dict["node2;kgtk:precision"] = property_dict.get("precision", "")
        result_dict["node2;kgtk:globe"] = property_dict.get("globe", "")
        return

    # Every remaining type reads "value", even if no branch below uses it.
    value = property_dict["value"]

    if kind == "quantity":
        # node2;kgtk:number: for quantities, the number
        # node2;kgtk:units_node: for quantities, the unit
        # node2;kgtk:low_tolerance / high_tolerance: lower and upper bound of
        # the value (cannot be produced in T2WML yet)
        result_dict["node2;kgtk:data_type"] = "quantity"
        result_dict["node2;kgtk:number"] = value
        result_dict["node2;kgtk:units_node"] = property_dict.get("unit", "")
        result_dict["node2;kgtk:low_tolerance"] = property_dict.get("lower-bound", "")
        result_dict["node2;kgtk:high_tolerance"] = property_dict.get("upper-bound", "")
    elif kind == "time":
        # node2;kgtk:date_and_time: for dates, the ISO-formatted date
        # node2;kgtk:precision: for dates, the precision (format still to be
        # verified with the KGTK maintainers: integer vs. readable string)
        # node2;kgtk:calendar: for dates, the qnode of the calendar, if given
        result_dict["node2;kgtk:data_type"] = "date_and_times"
        result_dict["node2;kgtk:date_and_time"] = enclose_in_quotes(value)
        result_dict["node2;kgtk:precision"] = property_dict.get("precision", "")
        result_dict["node2;kgtk:calendar"] = property_dict.get("calendar", "")
    elif kind in ("string", "monolingualtext", "externalidentifier", "url"):
        # node2;kgtk:text: for text, the text without the language tag
        # node2;kgtk:language: for text, the language tag
        language = property_dict.get("lang", "")
        result_dict["node2;kgtk:data_type"] = "string"
        result_dict["node2;kgtk:text"] = enclose_in_quotes(value)
        result_dict["node2;kgtk:language"] = enclose_in_quotes(language)
    elif kind in ("wikibaseitem", "wikibaseproperty"):
        # node2;kgtk:symbol: when node2 is another item, the item goes here
        result_dict["node2;kgtk:data_type"] = "symbol"
        result_dict["node2;kgtk:symbol"] = value
def create_kgtk(data, file_path, sheet_name):
    """Serialize statements into a KGTK-style, tab-separated string.

    Args:
        data: Mapping from a cell identifier (string) to its statement dict.
            Each statement must provide "item" and "property" plus the keys
            that ``kgtk_add_property_type_specific_fields`` reads for the
            property's type. An optional "qualifier" list holds dicts with
            their own "property" and type-specific keys.
        file_path: Path of the source file. Only its file name and extension
            are used, to build edge ids.
        sheet_name: Sheet name; appended to the file name (after a ".") in
            edge ids unless the file extension is ".csv".

    Returns:
        str: A header line followed by one line per statement, each statement
        being immediately followed by its qualifier lines. Columns that do
        not apply to a row are left empty.

    Raises:
        Any exception raised by ``kgtk_add_property_type_specific_fields``
            propagates unchanged.
        csv.Error: A field contains the delimiter or a line terminator
            character (fields are written unquoted and unescaped).
    """
    source = Path(file_path)
    # A CSV file has no sheets, so the sheet name is left out of the ids.
    sheet_suffix = "" if source.suffix == ".csv" else "." + sheet_name
    id_prefix = source.name + sheet_suffix + ";"

    rows = []
    for cell, statement in data.items():
        edge_id = id_prefix + cell
        statement_row = dict(
            id=edge_id, node1=statement["item"], label=statement["property"])
        kgtk_add_property_type_specific_fields(statement, statement_row)
        rows.append(statement_row)

        # For now qualifier edges get no id of their own; they refer to the
        # statement edge through node1.
        for qualifier in statement.get("qualifier", []):
            qualifier_row = dict(node1=edge_id, label=qualifier["property"])
            kgtk_add_property_type_specific_fields(qualifier, qualifier_row)
            rows.append(qualifier_row)
        # TODO: handle references (statement.get("reference", []))

    fieldnames = [
        "id", "node1", "label", "node2", "node2;kgtk:data_type",
        "node2;kgtk:number", "node2;kgtk:low_tolerance",
        "node2;kgtk:high_tolerance", "node2;kgtk:units_node",
        "node2;kgtk:date_and_time", "node2;kgtk:precision",
        "node2;kgtk:calendar",
        "node2;kgtk:truth",
        "node2;kgtk:symbol",
        "node2;kgtk:latitude", "node2;kgtk:longitude", "node2;kgtk:globe",
        "node2;kgtk:text", "node2;kgtk:language",
    ]

    with StringIO("", newline="") as buffer:
        # quotechar/escapechar are None rather than '': Python 3.11+ rejects
        # empty strings for these, while None means "unset" on every version.
        # With no quote character, the literal double quotes already placed
        # in the values are written through untouched.
        writer = csv.DictWriter(
            buffer, fieldnames, restval="", dialect=csv.unix_dialect,
            delimiter="\t", lineterminator="\n",
            quoting=csv.QUOTE_NONE, quotechar=None, escapechar=None)
        writer.writeheader()
        writer.writerows(rows)
        return buffer.getvalue()