#!/usr/bin/env python
# encoding: utf-8
#
# Copyright © 2019, SAS Institute Inc., Cary, NC, USA. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
from sasctl.core import current_session, uri_as_str
from .service import Service
class TextParsing(Service):
    """The Text Parsing API parses natural language text documents.

    Parsing is a key operation in understanding your data. Parsing a document
    involves the following analyses:

    - Identifying terms used in the document
    - Recognizing parts of speech for each term
    - Identifying which terms are entities (person, country, and so on)
    - Resolving synonyms, misspellings, and so on

    The output tables that are generated during parsing can also be used in
    downstream analyses such as topic generation.

    """

    _SERVICE_ROOT = "/parsing"

    @classmethod
    def parse_documents(
        cls,
        documents,
        caslib=None,
        id_column=None,
        text_column=None,
        description=None,
        standard_entities=False,
        noun_groups=False,
        min_doc_count=10,
        concept_model=None,
        output_postfix=None,
        spell_check=False,
        override_list=None,
        stop_list=None,
        start_list=None,
        synonym_list=None,
        language="en",
    ):
        """Performs natural language parsing on the input data.

        Creates a text parsing job that executes asynchronously. There are two
        different interactions for parsing: parsing documents in CAS tables and
        parsing documents that are uploaded directly.

        Parameters
        ----------
        documents : str or dict or Iterable
            Documents to parse. May be either the URI to a CAS table where the
            documents are currently stored, or an iterable of strings containing
            the documents' text. A `str` or `dict` is always treated as a
            table reference; any other iterable is materialized into a list
            and uploaded directly.
        caslib : str or dict, optional
            URI of a caslib in which the documents will be stored. Required if
            `documents` is a list of strings. Ignored when `documents` is a
            table reference.
        id_column : str, optional
            The column in `documents` that contains a unique id for each
            document. Required if `documents` is a CAS table URI. Ignored when
            documents are uploaded directly.
        text_column : str, optional
            The column in `documents` that contains the document text to parse.
            Required if `documents` is a CAS table URI. Ignored when documents
            are uploaded directly.
        description : str, optional
            Description to add to the text parsing job.
        standard_entities : bool, optional
            Value sent as the job's ``includeStandardEntities`` setting.
            Defaults to False.
        noun_groups : bool, optional
            Value sent as the job's ``includeNounGroups`` setting.
            Defaults to False.
        min_doc_count : int, optional
            Minimum number of documents in which a term must appear to be kept.
            Defaults to 10.
        concept_model : str or dict, optional
            URI of a table containing the concept LITI binaries to apply during
            parsing.
        output_postfix : str, optional
            Text to be added to the end of all output table names.
        spell_check : bool, optional
            Whether spell checking should be performed during parsing.
            Defaults to False.
        override_list : str or dict, optional
            URI of a table containing overrides for the keep and drop terms.
        stop_list : str or dict, optional
            URI sent as the job's ``stopListUri`` setting.
        start_list : str or dict, optional
            URI sent as the job's ``startListUri`` setting.
        synonym_list : str or dict, optional
            URI sent as the job's ``synonymListUri`` setting.
        language : str, optional
            Two letter
            `ISO 639-1 <https://en.wikipedia.org/wiki/ISO_639>`_
            code indicating the source language. Defaults to 'en'.

        Returns
        -------
        RestObj
            The submitted job

        Raises
        ------
        RuntimeError
            If the current session reports a version of 4 or greater.
        TypeError
            If `documents` is None.

        See Also
        --------
        .cas_management.CASManagement.get_caslib
        .cas_management.CASManagement.get_table

        """
        if current_session().version_info() >= 4:
            raise RuntimeError("The Text Parsing service was removed from Viya 4.")

        if documents is None:
            raise TypeError("`documents` cannot be None.")

        if isinstance(documents, (dict, str)):
            # A single string or dict is a reference to an existing CAS table.
            data = {
                "inputUri": uri_as_str(documents),
                "documentIdVariable": id_column,
                "textVariable": text_column,
                "version": 1,
            }
        else:
            # Raw document text.  Materialize the iterable so that generators
            # and other lazy iterables can be encoded in the JSON payload.
            data = {
                "caslibUri": uri_as_str(caslib),
                "documents": list(documents),
                "version": 1,
            }

        data.update(
            {
                "description": description,
                "language": language,
                "includeStandardEntities": standard_entities,
                "includeNounGroups": noun_groups,
                "startListUri": uri_as_str(start_list),
                "stopListUri": uri_as_str(stop_list),
                "synonymListUri": uri_as_str(synonym_list),
                "minimumDocumentCount": min_doc_count,
                "conceptModelUri": uri_as_str(concept_model),
                "outputTableNamePostfix": output_postfix,
                "enableSpellChecking": spell_check,
                "overrideListUri": uri_as_str(override_list),
            }
        )

        # Optional fields are not ignored if None so explicitly remove before
        # sending.
        data = {key: value for key, value in data.items() if value is not None}

        url = "/jobs"

        # Update URL and request media type if passing in raw documents.
        if "documents" in data:
            url += "#data"
            content_type = (
                "application/vnd.sas.text.parsing.job.request.documents+json"
            )
        else:
            content_type = "application/vnd.sas.text.parsing.job.request+json"

        headers = {
            "Content-Type": content_type,
            "Accept": "application/vnd.sas.text.parsing.job+json",
        }

        return cls.post(url, json=data, headers=headers)