Source code for sasctl._services.text_parsing

#!/usr/bin/env python
# encoding: utf-8
#
# Copyright © 2019, SAS Institute Inc., Cary, NC, USA.  All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0


from sasctl.core import current_session, uri_as_str

from .service import Service


class TextParsing(Service):
    """The Text Parsing API parses natural language text documents.

    Parsing is a key operation in understanding your data.  Parsing a document
    involves the following analyses:

    - Identifying terms used in the document
    - Recognizing parts of speech for each term
    - Identifying which terms are entities (person, country, and so on)
    - Resolving synonyms, misspellings, and so on

    The output tables that are generated during parsing can also be used in
    downstream analyses such as topic generation.

    """

    _SERVICE_ROOT = "/parsing"

    @classmethod
    def parse_documents(
        cls,
        documents,
        caslib=None,
        id_column=None,
        text_column=None,
        description=None,
        standard_entities=False,
        noun_groups=False,
        min_doc_count=10,
        concept_model=None,
        output_postfix=None,
        spell_check=False,
        override_list=None,
        stop_list=None,
        start_list=None,
        synonym_list=None,
        language="en",
    ):
        """Performs natural language parsing on the input data.

        Creates a text parsing job that executes asynchronously.  There are two
        different interactions for parsing: parsing documents in CAS tables and
        parsing documents that are uploaded directly.

        Parameters
        ----------
        documents : str or dict or Iterable
            Documents to parse.  May be either the URI to a CAS table where the
            documents are currently stored, or an iterable of strings
            containing the documents' text.  Any non-str, non-dict iterable
            (including generators) is materialized into a list before being
            sent.
        caslib : str or dict, optional
            URI of a caslib in which the documents will be stored.  Required if
            `documents` is a list of strings.
        id_column : str, optional
            The column in `documents` that contains a unique id for each
            document.  Required if `documents` is a CAS table URI.
        text_column : str, optional
            The column in `documents` that contains the document text to parse.
            Required if `documents` is a CAS table URI.
        description : str, optional
            Description to add to the text parsing job.
        standard_entities : bool, optional
            Value sent as the job's ``includeStandardEntities`` setting.
            Defaults to False.
        noun_groups : bool, optional
            Value sent as the job's ``includeNounGroups`` setting.  Defaults to
            False.
        min_doc_count : int, optional
            Minimum number of documents in which a term must appear to be kept.
            Defaults to 10.
        concept_model : str or dict, optional
            URI of a table containing the concept LITI binaries to apply during
            parsing.
        output_postfix : str, optional
            Text to be added to the end of all output table names.
        spell_check : bool, optional
            Whether spell checking should be performed during parsing.
        override_list : str or dict, optional
            URI of a table containing overrides for the keep and drop terms.
        stop_list : str or dict, optional
            URI sent as the job's ``stopListUri``.
        start_list : str or dict, optional
            URI sent as the job's ``startListUri``.
        synonym_list : str or dict, optional
            URI sent as the job's ``synonymListUri``.
        language : str, optional
            Two letter `ISO 639-1 <https://en.wikipedia.org/wiki/ISO_639>`_
            code indicating the source language.  Defaults to 'en'.

        Returns
        -------
        RestObj
            The submitted job

        Raises
        ------
        RuntimeError
            If the current session reports a version of 4 or greater.
        TypeError
            If `documents` is None.

        See Also
        --------
        .cas_management.CASManagement.get_caslib
        .cas_management.CASManagement.get_table

        """
        if current_session().version_info() >= 4:
            raise RuntimeError("The Text Parsing service was removed from Viya 4.")

        if documents is None:
            raise TypeError("`documents` cannot be None.")

        if isinstance(documents, (dict, str)):
            # A str or dict is treated as a reference to an existing table.
            data = {
                "inputUri": uri_as_str(documents),
                "documentIdVariable": id_column,
                "textVariable": text_column,
                "version": 1,
            }
        else:
            data = {
                "caslibUri": uri_as_str(caslib),
                # Materialize the iterable: generators and other lazy iterables
                # cannot be serialized to JSON directly.
                "documents": list(documents),
                "version": 1,
            }

        data.update(
            {
                "description": description,
                "language": language,
                "includeStandardEntities": standard_entities,
                "includeNounGroups": noun_groups,
                "startListUri": uri_as_str(start_list),
                "stopListUri": uri_as_str(stop_list),
                "synonymListUri": uri_as_str(synonym_list),
                "minimumDocumentCount": min_doc_count,
                "conceptModelUri": uri_as_str(concept_model),
                "outputTableNamePostfix": output_postfix,
                "enableSpellChecking": spell_check,
                "overrideListUri": uri_as_str(override_list),
            }
        )

        # Optional fields are not ignored if None so explicitly remove before
        # sending.
        data = {key: value for key, value in data.items() if value is not None}

        url = "/jobs"
        headers = {"Accept": "application/vnd.sas.text.parsing.job+json"}

        # Update URL if passing in raw documents.
        if "documents" in data:
            url += "#data"
            headers["Content-Type"] = (
                "application/vnd.sas.text.parsing.job.request.documents+json"
            )
        else:
            headers["Content-Type"] = (
                "application/vnd.sas.text.parsing.job.request+json"
            )

        return cls.post(url, json=data, headers=headers)