odbcTest.py
#!/usr/bin/python
import pyodbc
import datetime
import os
import sys
import fnmatch
import json
import re
import glob
import decimal
# Python 2 "types" module; import only the type objects used in the mapping below
from types import LongType, BooleanType, FloatType, IntType, BufferType, StringType
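
# Usage, inferred from the argument handling at the bottom of this script
# (paths are hypothetical examples):
#   ./odbcTest.py <query-file-or-dir> [output-file-or-result-dir] [result-dir]
#   ./odbcTest.py Functional/odbc/query1.sql      # run a single query file
#   ./odbcTest.py Functional/odbc                 # run every query in a directory
#   ./odbcTest.py Functional/odbc results.out     # write results to a named file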
# Java SQL data type codes (the java.sql.Types constants)
ARRAY = 2003
BIGINT = -5
BINARY = -2
BIT = -7
BLOB = 2004
BOOLEAN = 16
CHAR = 1
CLOB = 2005
DATALINK = 70
DATE = 91
DECIMAL = 3
DISTINCT = 2001
DOUBLE = 8
FLOAT = 6
INTEGER = 4
JAVA_OBJECT = 2000
LONGNVARCHAR = -16
LONGVARBINARY = -4
LONGVARCHAR = -1
NCHAR = -15
NCLOB = 2011
NULL = 0
NUMERIC = 2
NVARCHAR = -9
OTHER = 1111
REAL = 7
REF = 2006
ROWID = -8
SMALLINT = 5
SQLXML = 2009
STRUCT = 2002
TIME = 92
TIMESTAMP = 93
TINYINT = -6
VARBINARY = -3
VARCHAR = 12
NONETYPE = -100  # local sentinel for types with no java.sql.Types mapping
def ParseJSONFile(json_file):
    with open(json_file, 'r') as json_data:
        parse_json = json.load(json_data)
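    # The JSON layout this function assumes, sketched with hypothetical values
    # ("output-file" is optional and falls back to the ".o" extension below):
    #
    #   { "matrices": [ { "query-file":    ".*.sql",
    #                     "expected-file": ".*.e",
    #                     "schema":        "dfs.drillTestDir",
    #                     "output-file":   ".*.o" } ] }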
    # extract the query file extension from the JSON file
    queryFile = parse_json["matrices"][0]["query-file"]
    queryFileRegex = re.compile(r"\.\*(.*)")
    queryFileRegexSearch = queryFileRegex.search(queryFile)
    queryFileExt = ""
    if queryFileRegexSearch:
        queryFileExt = queryFileRegexSearch.group(1)
    # extract the expected file extension from the JSON file
    expectedFile = parse_json["matrices"][0]["expected-file"]
    expectedFileRegex = re.compile(r"\.\*(.*)")
    expectedFileRegexSearch = expectedFileRegex.search(expectedFile)
    expectedFileExt = ""
    if expectedFileRegexSearch:
        expectedFileExt = expectedFileRegexSearch.group(1)
    # extract the schema from the JSON file and convert it from unicode to utf-8
    schema_string = parse_json["matrices"][0]["schema"]
    schema = schema_string.encode('utf-8')
    # the output file extension is optional and defaults to ".o"
    try:
        outputFile = parse_json["matrices"][0]["output-file"]
        outputFileRegex = re.compile(r"\.\*(.*)")
        outputFileExt = outputFileRegex.search(outputFile).group(1)
    except KeyError:
        outputFileExt = ".o"
    return (queryFile, queryFileExt, expectedFile, expectedFileExt, schema,
            outputFileExt)
def SaveData(rows, queryDir, queryFileName, outputFileExt, resultFileCreated):
    # create a "type" file that contains the data type of each column
    typeFile = "%s/%s%s" % (queryDir, queryFileName, ".type")
    typeFile = open(typeFile, 'w')
    # create a "label" file that contains the column names
    columnFile = "%s/%s%s" % (queryDir, queryFileName, ".label")
    columnFile = open(columnFile, 'w')
    # capture the type objects of the "special" data types so they can be
    # used as dictionary keys below
    newdecimal = type(decimal.Decimal(0))
    newdate = type(datetime.date(1, 2, 3))
    newtimestamp = type(datetime.datetime(1, 2, 3))
    newtime = type(datetime.time(1, 2, 3))
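    # pyodbc describes each result column as a tuple in cursor.description;
    # its second element is the Python type of the column's values, e.g. a
    # Drill DATE column arrives as datetime.date and maps to type code 91.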
    # determine the column names and data types returned by this query
    # and store them in the "type" file and "label" file
    dataType = []
    getTypeArray = {
        LongType: BIGINT,
        BooleanType: BOOLEAN,
        newdate: DATE,
        newdecimal: DECIMAL,
        FloatType: FLOAT,
        IntType: INTEGER,
        newtime: TIME,
        newtimestamp: TIMESTAMP,
        BufferType: VARBINARY,
        StringType: VARCHAR
    }
    for column in cursor.description:
        getType = getTypeArray.get(column[1], NONETYPE)
        if getType == NONETYPE:
            if column[1] == pyodbc.DATETIME:
                getType = TIMESTAMP
            else:
                sys.exit("unsupported column type: %s" % column[1])
        dataType.append(getType)
        typeFile.write("%s\n" % getType)
        columnFile.write("%s\n" % column[0])
    typeFile.close()
    columnFile.close()
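    # Example (hypothetical): a query returning (id INTEGER, name VARCHAR)
    # yields a .type file with the lines "4" and "12" (java.sql.Types codes)
    # and a .label file with the lines "id" and "name".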
    # Open the output file. By default the output file is created in the same
    # directory as the query file, but it can also be created in a separate
    # directory.
    if outputFileGiven:
        outputFile = outputFileArg
    elif resultFileCreated:
        outputFile = "%s/%s%s" % (resultDir, queryFileName, outputFileExt)
    else:
        outputFile = "%s/%s%s" % (queryDir, queryFileName, outputFileExt)
    outputFileH = open(outputFile, 'w')
    # evaluate each row that is returned
    for row in rows:
        # represent SQL NULLs by converting None values to the string "null"
        for index, value in enumerate(row):
            if value is None:
                row[index] = "null"
        # Convert the tuple of row fields into a single tab-delimited string.
        # For some data types the Simba ODBC driver returns data that does not
        # exactly match what the Apache Drill driver returns, so the Simba data
        # is adjusted here to match the contents of the expected results files.
        convertToString = ""
        for index, field in enumerate(row):
            fieldString = str(field)
            if dataType[index] == BOOLEAN:
                if fieldString == "True":
                    field = "true"
                elif fieldString == "False":
                    field = "false"
            if dataType[index] == TIME and fieldString != "null":
                # reduce microsecond precision; Drill only has milliseconds
                if field.microsecond > 0:
                    fieldString = fieldString.rstrip('0123456789')
                    fieldString = "%s%s" % (fieldString, field.microsecond)
                field = fieldString
            if dataType[index] == TIMESTAMP and fieldString != "null":
                # Drill always prints a fractional-seconds part: strip trailing
                # zeros when microseconds are present, append ".0" otherwise
                try:
                    data = datetime.datetime.strptime(fieldString, "%Y-%m-%d %H:%M:%S.%f")
                    if data.microsecond > 0:
                        fieldString = fieldString.rstrip('0')
                except ValueError:
                    data = datetime.datetime.strptime(fieldString, "%Y-%m-%d %H:%M:%S")
                    fieldString = fieldString + ".0"
                field = fieldString
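            # e.g. "2015-01-01 10:00:00.120000" becomes "2015-01-01 10:00:00.12",
            # and "2015-01-01 10:00:00" becomes "2015-01-01 10:00:00.0"
            # (hypothetical values)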
            if dataType[index] == VARCHAR and len(fieldString) > 0 and fieldString[0] in ('{', '['):
                # for varchars that begin with a curly brace or square bracket,
                # remove spaces and newlines unless they are inside a
                # double-quoted string
                newFields = []
                inQuotes = False
                start = 0
                stringLength = len(fieldString)
                for current, character in enumerate(fieldString):
                    if character == "\"":
                        inQuotes = not inQuotes
                    atLastChar = (current == stringLength - 1)
                    if atLastChar:
                        newFields.append(fieldString[start:])
                    elif (character == ' ' or character == '\n') and not inQuotes:
                        newFields.append(fieldString[start:current])
                        start = current + 1
                field = "".join(newFields)
            # append the field to the tab-delimited row string
            if convertToString:
                convertToString = convertToString + "\t" + str(field)
            else:
                convertToString = str(field)
        # save the row to the output file; it should match the expected results file
        outputFileH.write("%s\n" % convertToString)
    outputFileH.close()
def ExecuteQuery(queryDir, queryFile, outputFileExt, resultFileCreated):
    queryFileNameRegex = re.compile(r"(.*)\.(.*)$")
    queryFileName = queryFileNameRegex.search(queryFile).group(1)
    with open("%s/%s" % (queryDir, queryFile), 'r') as queryFileH:
        queryStrings = queryFileH.read()
    # If the file holds multiple SQL statements, only one of them is the main
    # query: the statements before it set it up, and the statements after it
    # restore the state of the system after the query runs. A statement that
    # starts with "--@test" is treated as the main query; if none is marked,
    # the main query is assumed to sit in the middle, with the same number of
    # statements before and after it, so an odd number of statements is expected.
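    # A hypothetical query file in this format (the session option shown is
    # only an illustration):
    #   alter session set `planner.enable_hashjoin` = false;
    #   --@test
    #   select count(*) from region;
    #   alter session set `planner.enable_hashjoin` = true;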
    if queryStrings.count(';') > 1:
        # find the main query
        numberStatements = queryStrings.count(';')
        statements = queryStrings.split(';')
        mainStatementIndex = -1
        for index, statement in enumerate(statements):
            statement = statement.lstrip()
            if statement.startswith("--@test"):
                mainStatementIndex = index
                break
        if mainStatementIndex == -1:
            mainStatementIndex = numberStatements / 2
        # execute the setup statements
        for index in range(0, mainStatementIndex):
            queryString = statements[index]
            queryString = queryString.rstrip('\n;')
            try:
                value = cursor.execute(queryString)
            except Exception as inst:
                print "ERROR - execute error for setup statement: %s" % inst
        mainStatement = statements[mainStatementIndex]
        multipleStatements = True
    else:
        mainStatement = queryStrings
        multipleStatements = False
    queryString = mainStatement.rstrip(';\n\r\t ')
    try:
        value = cursor.execute(queryString)
    except Exception as inst:
        print "ERROR - execute error for main statement: %s" % inst
        failedTests.append(queryFile)
    else:
        # only fetch and save results when the main statement succeeded
        rows = cursor.fetchall()
        SaveData(rows, queryDir, queryFileName, outputFileExt, resultFileCreated)
    # if there are statements after the main statement, execute them
    # to restore the system
    if multipleStatements:
        for index in range(mainStatementIndex + 1, numberStatements):
            queryString = statements[index]
            queryString = queryString.rstrip('\n;')
            try:
                value = cursor.execute(queryString)
            except Exception as inst:
                print "ERROR - execute error for cleanup statement: %s" % inst
def GetJSONFileAndExecSingleQuery(queryDir, singleFile):
    # find the json file that describes the query file
    json_files = glob.glob("%s/*.json" % queryDir)
    if not json_files:
        # walk up the parent directories until a json file is found (one is
        # assumed to exist somewhere above the query directory)
        updir = ""
        while not json_files:
            updir = "%s/.." % updir
            json_files = glob.glob("%s%s/*.json" % (queryDir, updir))
    for json_file in json_files:
        queryFile, queryFileExt, expectedFile, expectedFileExt, schema, \
            outputFileExt = ParseJSONFile(json_file)
        # execute the query file
        if queryFileExt:
            # run the query file only if it matches the query file extension
            if fnmatch.fnmatch(singleFile, '*%s' % queryFileExt):
                # execute a SQL statement to "use" the schema specified by
                # the JSON file
                try:
                    value = cursor.execute("use `%s`" % schema)
                except Exception:
                    print "ERROR - execute error for schema"
                    raise RuntimeError('cannot use schema %s' % schema)
                # extract the query file name from singleFile, which has an
                # absolute path
                queryFileRegex = re.compile(".*/(.*)$")
                queryFileRegexSearch = queryFileRegex.search(singleFile)
                queryFile = queryFileRegexSearch.group(1)
                ExecuteQuery(queryDir, queryFile, outputFileExt, resultFileCreated)
        else:
            # If there is no query file extension, the json file names one
            # specific query file. Check whether the file specified in the json
            # file matches the file requested by the user; if so, execute the
            # requested file using the information in this json file.
            if singleFile == '%s/%s' % (queryDir, queryFile):
                # execute a SQL statement to "use" the schema specified by
                # the JSON file
                try:
                    value = cursor.execute("use `%s`" % schema)
                except Exception:
                    print "ERROR - execute error for schema"
                    raise RuntimeError('cannot use schema %s' % schema)
                ExecuteQuery(queryDir, queryFile, outputFileExt, resultFileCreated)
def GetJSONFileAndExecQueries(queryDir):
    for root, dirnames, filenames in os.walk(queryDir):
        # find the json files in this directory
        for filename in fnmatch.filter(filenames, '*.json'):
            json_file = os.path.join(root, filename)
            queryFile, queryFileExt, expectedFile, expectedFileExt, schema, \
                outputFileExt = ParseJSONFile(json_file)
            # execute a SQL statement to "use" the schema specified by
            # the JSON file
            cursor.execute("use `%s`" % schema)
            # get the list of query files for this json file; the glob
            # results carry the directory path
            if queryFileExt:
                queryFiles = glob.glob("%s/*%s" % (root, queryFileExt))
            else:
                # the json file names one specific query file
                queryFiles = glob.glob("%s/%s" % (root, queryFile))
            queryFileRegex = re.compile(".*/(.*)")
            for queryFileWithPath in queryFiles:
                queryFileRegexSearch = queryFileRegex.search(queryFileWithPath)
                queryFile = queryFileRegexSearch.group(1)
                # execute each query file
                ExecuteQuery(root, queryFile, outputFileExt, resultFileCreated)
# determine whether one query is being executed, or a directory of queries
singleFileTest = False
singleFile = ""
if os.path.isfile(sys.argv[1]):
    singleFile = sys.argv[1]
    queryDir = os.path.dirname(singleFile)
    singleFileTest = True
elif os.path.isdir(sys.argv[1]):
    queryDir = sys.argv[1]
else:
    print "ERROR - %s needs to be a file or directory" % sys.argv[1]
    sys.exit(-1)
today = datetime.datetime.now()
resultDir = ""
outputFileArg = ""
resultFileCreated = False
outputFileGiven = False
if len(sys.argv) == 3:
    # get the output file name
    outputFileArg = sys.argv[2]
    outputFileArg = os.path.join(os.getcwd(), outputFileArg)
    print "outputFileArg: " + outputFileArg
    # check whether arg 2 is an existing directory, in which case the result
    # file goes there
    if os.path.isdir(outputFileArg):
        resultDir = outputFileArg
        # create a generic result file name based on today's date
        resultfile = "ODBC-Test-Result-%s.out" % today.isoformat()
        resultfilepath = "%s/%s" % (resultDir, resultfile)
        resultFileCreated = True
        if os.path.exists(resultfilepath):
            print "result file %s exists\n" % resultfilepath
            sys.exit(-1)
    else:
        # arg 2 is a file, not a directory; use it as the output file
        outputFileGiven = True
if len(sys.argv) == 4:
    if resultFileCreated:
        print "result file %s has already been created\n" % resultfilepath
        sys.exit(-1)
    resultDir = sys.argv[3]
    print "resultDir: " + resultDir
    resultfile = "ODBC-Test-Result-%s.out" % today.isoformat()
    resultfilepath = "%s/%s" % (resultDir, resultfile)
    resultFileCreated = True
    if os.path.exists(resultfilepath):
        print "result file path %s exists" % resultfilepath
        sys.exit(-1)
# connect to Drill through the ODBC DSN
dbh = pyodbc.connect('DSN=MapR Drill 64-bit', autocommit=True)
cursor = dbh.cursor()
failedTests = []
json_files = []
if singleFileTest:
    GetJSONFileAndExecSingleQuery(queryDir, singleFile)
else:
    GetJSONFileAndExecQueries(queryDir)
cursor.close()
del cursor
dbh.close()
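# report any queries whose main statement failed to execute
# (failedTests is populated in ExecuteQuery)
if failedTests:
    print "failed tests: %s" % ", ".join(failedTests)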
sys.exit(0)