# function_app.py (forked from Azure/gpt-rag-ingestion)
import datetime
import json
import logging
from json import JSONEncoder

import azure.functions as func
import jsonschema

from chunker.chunk_documents_formrec import chunk_document

app = func.FunctionApp()


class DateTimeEncoder(JSONEncoder):
    # Override default() to serialize date/datetime objects as ISO-8601 strings.
    def default(self, obj):
        if isinstance(obj, (datetime.date, datetime.datetime)):
            return obj.isoformat()
        # Defer to the base class for unsupported types (raises TypeError).
        return super().default(obj)
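
# Example (illustrative): json.dumps({"ts": datetime.date(2024, 1, 1)}, cls=DateTimeEncoder)
# returns '{"ts": "2024-01-01"}'.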
@app.route(route="document-chunking", auth_level=func.AuthLevel.FUNCTION)
def document_chunking(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Invoked document_chunking skill.')
    try:
        body = req.get_json()
        logging.debug(f'REQUEST BODY: {body}')
        jsonschema.validate(body, schema=get_request_schema())
        if body:
            result = process_documents(body)
            logging.info('Finished document_chunking skill.')
            return func.HttpResponse(result, mimetype="application/json")
        else:
            error_message = "Invalid body."
            logging.error(error_message)
            return func.HttpResponse(error_message, status_code=400)
    except ValueError as e:
        error_message = "Invalid body: {0}".format(e)
        logging.error(error_message)
        return func.HttpResponse(error_message, status_code=400)
    except jsonschema.exceptions.ValidationError as e:
        error_message = "Invalid request: {0}".format(e)
        logging.error(error_message)
        return func.HttpResponse(error_message, status_code=400)
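
# Example invocation when running locally with `func start` (default port 7071; the
# function key is not enforced by the local host). The payload file name is hypothetical:
#
#   curl -X POST "http://localhost:7071/api/document-chunking" \
#        -H "Content-Type: application/json" \
#        -d @sample_request.json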
def process_documents(body):
    values = body['values']
    results = {"values": []}
    for value in values:
        # Chunk each record (document) individually.
        data = value['data']
        logging.info(f"Chunking {data['documentUrl'].split('/')[-1]}.")
        chunks, errors, warnings = chunk_document(data)
        output_record = {
            "recordId": value['recordId'],
            "data": {
                "chunks": chunks
            },
            "errors": errors,
            "warnings": warnings
        }
        results["values"].append(output_record)
    logging.info('Finished process_documents.')
    return json.dumps(results, ensure_ascii=False, cls=DateTimeEncoder)
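
# For reference, the serialized response follows the Azure AI Search custom skill
# contract; each chunk carries fields such as filepath, chunk_id, offset, length,
# page, title, category, url, content, and contentVector. Values here are illustrative:
#
# {
#   "values": [
#     {
#       "recordId": "0",
#       "data": {
#         "chunks": [
#           {"chunk_id": 0, "page": 1, "title": "default", "content": "...", "contentVector": [0.1]}
#         ]
#       },
#       "errors": [],
#       "warnings": []
#     }
#   ]
# }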
def get_request_schema():
    return {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "type": "object",
        "properties": {
            "values": {
                "type": "array",
                "minItems": 1,
                "items": {
                    "type": "object",
                    "properties": {
                        "recordId": {"type": "string"},
                        "data": {
                            "type": "object",
                            "properties": {
                                "documentUrl": {"type": "string", "minLength": 1},
                                "documentContent": {"type": "string"},
                                "documentSasToken": {"type": "string", "minLength": 1},
                                "documentContentType": {"type": "string", "minLength": 1}
                            },
                            "required": ["documentContent", "documentUrl", "documentSasToken", "documentContentType"]
                        }
                    },
                    "required": ["recordId", "data"]
                }
            }
        },
        "required": ["values"]
    }
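
# A minimal local sanity check, assuming this module is run directly (the Functions
# runtime only imports it, so this block never executes in Azure). The documentUrl
# and documentSasToken values are placeholders, not a real blob or token.
if __name__ == "__main__":
    sample_body = {
        "values": [
            {
                "recordId": "0",
                "data": {
                    "documentUrl": "https://example.blob.core.windows.net/docs/sample.pdf",
                    "documentContent": "",
                    "documentSasToken": "?sv=placeholder",
                    "documentContentType": "application/pdf"
                }
            }
        ]
    }
    # Should not raise: the payload satisfies get_request_schema().
    jsonschema.validate(sample_body, schema=get_request_schema())
    # DateTimeEncoder handles datetimes that may appear in chunk metadata.
    print(json.dumps({"processed_at": datetime.datetime(2024, 1, 1)}, cls=DateTimeEncoder))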