From 081c5f2696331e7fe586332b8de57e9683ab9eb1 Mon Sep 17 00:00:00 2001 From: Philip Stern Date: Fri, 21 Jun 2024 18:26:05 -0400 Subject: [PATCH 1/2] Adding Chat Completion Types --- openapi.yaml | 380 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 258 insertions(+), 122 deletions(-) diff --git a/openapi.yaml b/openapi.yaml index 8ac8f20..9d2c1ff 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -1,6 +1,6 @@ openapi: 3.1.0 info: - title: Together APIs + title: Together API description: The Together REST API. Please see https://docs.together.ai for more details. version: '2.0.0' termsOfService: https://www.together.ai/terms-of-service @@ -17,9 +17,7 @@ security: paths: /chat/completions: post: - tags: ['Chat'] - summary: Create chat completion - description: Query a chat model. + summary: Creates a model response for the given chat conversation. operationId: chat-completions requestBody: content: @@ -75,9 +73,7 @@ paths: deprecated: false /completions: post: - tags: ['Completion'] - summary: Create completion - description: Query a language, code, or image model. + summary: Creates a completion for the provided prompt and parameters operationId: completions requestBody: content: @@ -133,9 +129,7 @@ paths: deprecated: false /embeddings: post: - tags: ['Embeddings'] - summary: Create embedding - description: Query an embedding model for a given string of text. + summary: Creates an embedding vector representing the input text operationId: embeddings requestBody: content: @@ -188,9 +182,7 @@ paths: deprecated: false /models: get: - tags: ['Models'] - summary: List all models - description: Lists all of Together's open-source models + summary: Lists all the available models operationId: models responses: '200': @@ -238,9 +230,7 @@ paths: deprecated: false /images/generations: post: - tags: ['Images'] - summary: Create image - description: Use an image model to generate an image for a given prompt. + summary: Generate images based on a given prompt using a specified model requestBody: required: true content: @@ -290,9 +280,7 @@ paths: $ref: '#/components/schemas/ImageResponse' /files: get: - tags: ['Files'] summary: List all files - description: List the metadata for all uploaded data files. responses: '200': description: List of files @@ -302,9 +290,7 @@ paths: $ref: '#/components/schemas/FileList' /files/{id}: get: - tags: ['Files'] - summary: List file - description: List the metadata for a single uploaded data file. + summary: Retrieve a file parameters: - name: id in: path @@ -319,9 +305,7 @@ paths: schema: $ref: '#/components/schemas/FileResponse' delete: - tags: ['Files'] summary: Delete a file - description: Delete a previously uploaded data file. parameters: - name: id in: path @@ -337,9 +321,7 @@ paths: $ref: '#/components/schemas/FileDeleteResponse' /files/{id}/content: get: - tags: ['Files'] - summary: Get file contents - description: Get the contents of a single uploaded data file. + summary: Retrieve file content parameters: - name: id in: path @@ -355,9 +337,7 @@ paths: $ref: '#/components/schemas/FileObject' /fine-tunes: post: - tags: ['Fine-tuning'] - summary: Create job - description: Use a model to create a fine-tuning job. + summary: Create a fine-tuning job requestBody: required: true content: @@ -405,9 +385,7 @@ paths: schema: $ref: '#/components/schemas/FinetuneResponse' get: - tags: ['Fine-tuning'] - summary: List all jobs - description: List the metadata for all fine-tuning jobs. 
+ summary: List fine-tune job history responses: '200': description: List of fine-tune jobs @@ -417,9 +395,7 @@ paths: $ref: '#/components/schemas/FinetuneList' /fine-tunes/{id}: get: - tags: ['Fine-tuning'] - summary: List job - description: List the metadata for a single fine-tuning job. + summary: Retrieve fine-tune job details parameters: - name: id in: path @@ -435,9 +411,7 @@ paths: $ref: '#/components/schemas/FinetuneResponse' /fine-tunes/{id}/events: get: - tags: ['Fine-tuning'] - summary: List job events - description: List the events for a single fine-tuning job. + summary: List events of a fine-tune job parameters: - name: id in: path @@ -453,9 +427,7 @@ paths: $ref: '#/components/schemas/FinetuneListEvents' /finetune/download: get: - tags: ['Fine-tuning'] - summary: Download model - description: Download a compressed fine-tuned model or checkpoint to local disk. + summary: Downloads a compressed fine-tuned model or checkpoint to local disk. parameters: - in: query name: ft_id @@ -488,9 +460,7 @@ paths: description: Fine-tune ID not found. /fine-tunes/{id}/cancel: post: - tags: ['Fine-tuning'] - summary: Cancel job - description: Cancel a currently running fine-tuning job. + summary: Cancels a running fine-tuning job. parameters: - in: path name: id @@ -509,15 +479,11 @@ paths: description: Invalid request parameters. '404': description: Fine-tune ID not found. - components: securitySchemes: bearerAuth: - type: apiKey - in: header - name: Authorization - x-bearer-format: bearer - x-default: default + type: http + scheme: bearer schemas: ErrorData: @@ -555,10 +521,24 @@ components: - eos - length - tool_calls + - function_call + + LogProbItem: + type: object + required: [token, logprob] + properties: + token: + type: string + logprob: + type: number LogprobsPart: type: object properties: + content: + type: array + items: + $ref: '#/components/schemas/LogProbItem' tokens: type: array items: @@ -568,7 +548,6 @@ components: type: array items: type: number - format: float description: List of token log probabilities PromptPart: @@ -608,9 +587,8 @@ components: finish_reason: $ref: '#/components/schemas/FinishReason' logprobs: - allOf: - - $ref: '#/components/schemas/LogprobsPart' - - nullable: true + type: object + $ref: '#/components/schemas/LogprobsPart' CompletionRequest: type: object @@ -631,20 +609,18 @@ components: description: The maximum number of tokens to generate. stop: type: array - description: A list of string sequences that will truncate (stop) inference text output. For example, "" will stop generation as soon as the model generates the given token. + description: A list of string sequences that will truncate (stop) inference text output. items: type: string temperature: type: number - description: A decimal number from 0-1 that determines the degree of randomness in the response. A temperature less than 1 favors more correctness and is appropriate for question answering or summarization. A value closer to 1 introduces more randomness in the output. - format: float + description: Determines the degree of randomness in the response. top_p: type: number - description: A percentage (also called the nucleus parameter) that's used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. It specifies a probability threshold below which all less likely tokens are filtered out. This technique helps maintain diversity and generate more fluent and natural-sounding text. 
- format: float + description: The `top_p` (nucleus) parameter is used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. top_k: type: integer - description: An integer that's used to limit the number of choices for the next predicted word or token. It specifies the maximum number of tokens to consider at each step, based on their probability of occurrence. This technique helps to speed up the generation process and can improve the quality of the generated text by focusing on the most likely options. + description: The `top_k` parameter is used to limit the number of choices for the next predicted word or token. format: int32 repetition_penalty: type: number @@ -652,42 +628,38 @@ components: format: float stream: type: boolean - description: 'If true, stream tokens as Server-Sent Events as the model generates them instead of waiting for the full model response. The stream terminates with `data: [DONE]`. If false, return a single JSON object containing the results.' + description: 'If set, tokens are returned as Server-Sent Events as they are made available. Stream terminates with `data: [DONE]`' logprobs: type: integer minimum: 0 maximum: 1 - description: Determines the number of most likely tokens to return at each token position log probabilities to return. + description: Determines the number of most likely tokens to return at each token position log probabilities to return echo: type: boolean - description: If true, the response will contain the prompt. Can be used with `logprobs` to return prompt logprobs. + description: If set, the response will contain the prompt, and will also return prompt logprobs if set with logprobs. n: type: integer - description: The number of completions to generate for each prompt. + description: Number of generations to return minimum: 1 maximum: 128 safety_model: type: string - description: The name of the moderation model used to validate tokens. Choose from the available moderation models found [here](https://docs.together.ai/docs/inference-models#moderation-models). + description: The name of the safety model to use. example: 'safety_model_name' min_p: type: number - description: A number between 0 and 1 that can be used as an alternative to temperature. - format: float + description: The `min_p` parameter is a number between 0 and 1 and an alternative to `temperature`. presence_penalty: type: number - description: A number between -2.0 and 2.0 where a positive value increases the likelihood of a model talking about new topics. - format: float + description: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a positive value will increase the likelihood of a model talking about new topics. frequency_penalty: type: number - description: A number between -2.0 and 2.0 where a positive value decreases the likelihood of repeating tokens that have already been mentioned. - format: float + description: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a positive value will decrease the likelihood of repeating tokens that were mentioned prior. logit_bias: type: object additionalProperties: type: number - format: float - description: Adjusts the likelihood of specific tokens appearing in the generated output. + description: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens appearing in the generated output. 
example: { '1024': -10.5, '105': 21.4 } CompletionResponse: type: object @@ -767,7 +739,6 @@ components: type: string logprob: type: number - format: float special: type: boolean @@ -776,20 +747,61 @@ components: items: type: object properties: - message: - type: object - properties: - role: - type: string - example: assistant - content: - type: string + text: + type: string + index: + type: integer + seed: + type: integer finish_reason: $ref: '#/components/schemas/FinishReason' + message: + $ref: '#/components/schemas/ChatCompletionMessage' logprobs: allOf: - nullable: true - $ref: '#/components/schemas/LogprobsPart' + ChatCompletionMessage: + type: object + required: [role, content] + properties: + content: + type: string + nullable: true + role: + type: string + enum: [assistant] + tool_calls: + type: array + items: + $ref: '#/components/schemas/ToolChoice' + function_call: + type: object + deprecated: true + required: [arguments, name] + properties: + arguments: + type: string + name: + type: string + ChatCompletionTool: + type: object + required: [type, function] + properties: + type: + type: string + enum: ['function'] + function: + type: object + required: [name] + properties: + description: + type: string + name: + type: string + parameters: + type: object + additionalProperties: true ChatCompletionRequest: type: object @@ -825,63 +837,66 @@ components: description: The maximum number of tokens to generate. stop: type: array - description: A list of string sequences that will truncate (stop) inference text output. For example, "" will stop generation as soon as the model generates the given token. + description: A list of string sequences that will truncate (stop) inference text output. items: type: string temperature: type: number - description: A decimal number from 0-1 that determines the degree of randomness in the response. A temperature less than 1 favors more correctness and is appropriate for question answering or summarization. A value closer to 1 introduces more randomness in the output. - format: float + description: Determines the degree of randomness in the response. top_p: type: number - description: A percentage (also called the nucleus parameter) that's used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. It specifies a probability threshold below which all less likely tokens are filtered out. This technique helps maintain diversity and generate more fluent and natural-sounding text. - format: float + description: The `top_p` (nucleus) parameter is used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. top_k: type: integer - description: An integer that's used to limit the number of choices for the next predicted word or token. It specifies the maximum number of tokens to consider at each step, based on their probability of occurrence. This technique helps to speed up the generation process and can improve the quality of the generated text by focusing on the most likely options. + description: The `top_k` parameter is used to limit the number of choices for the next predicted word or token. format: int32 repetition_penalty: type: number description: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. 
- format: float stream: type: boolean - description: 'If true, stream tokens as Server-Sent Events as the model generates them instead of waiting for the full model response. The stream terminates with `data: [DONE]`. If false, return a single JSON object containing the results.' + description: 'If set, tokens are returned as Server-Sent Events as they are made available. Stream terminates with `data: [DONE]`' logprobs: type: integer minimum: 0 maximum: 1 - description: Determines the number of most likely tokens to return at each token position log probabilities to return. + description: Determines the number of most likely tokens to return at each token position log probabilities to return echo: type: boolean - description: If true, the response will contain the prompt. Can be used with `logprobs` to return prompt logprobs. + description: If set, the response will contain the prompt, and will also return prompt logprobs if set with logprobs. n: type: integer - description: The number of completions to generate for each prompt. + description: Number of generations to return minimum: 1 maximum: 128 min_p: type: number - description: A number between 0 and 1 that can be used as an alternative to temperature. - format: float + description: The `min_p` parameter is a number between 0 and 1 and an alternative to `temperature`. presence_penalty: type: number - description: A number between -2.0 and 2.0 where a positive value increases the likelihood of a model talking about new topics. - format: float + description: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a positive value will increase the likelihood of a model talking about new topics. frequency_penalty: type: number - description: A number between -2.0 and 2.0 where a positive value decreases the likelihood of repeating tokens that have already been mentioned. - format: float + description: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a positive value will decrease the likelihood of repeating tokens that were mentioned prior. logit_bias: type: object additionalProperties: type: number - format: float - description: Adjusts the likelihood of specific tokens appearing in the generated output. + description: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens appearing in the generated output. example: { '1024': -10.5, '105': 21.4 } + function_call: + oneOf: + - type: string + enum: [none, auto] + - type: object + required: [name] + properties: + name: + type: string + response_format: type: object - description: An object specifying the format that the model must output. + description: Specifies the format of the response. properties: type: type: string @@ -894,21 +909,109 @@ components: description: The schema of the response format. tools: type: array - description: A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for. + description: A list of tools to be used in the query. items: $ref: '#/components/schemas/ToolsPart' tool_choice: type: object - description: Controls which (if any) function is called by the model. By default uses `auto`, which lets the model pick between generating a message or calling a function. + description: The choice of tool to use. oneOf: - type: string example: 'tool_name' - $ref: '#/components/schemas/ToolChoice' safety_model: type: string - description: The name of the moderation model used to validate tokens. 
Choose from the available moderation models found [here](https://docs.together.ai/docs/inference-models#moderation-models). + description: The name of the safety model to use. example: 'safety_model_name' + ChatCompletionMessageParam: + oneOf: + - $ref: '#/components/schemas/ChatCompletionSystemMessageParam' + - $ref: '#/components/schemas/ChatCompletionUserMessageParam' + - $ref: '#/components/schemas/ChatCompletionAssistantMessageParam' + - $ref: '#/components/schemas/ChatCompletionToolMessageParam' + - $ref: '#/components/schemas/ChatCompletionFunctionMessageParam' + + # Start Message Params + + ChatCompletionSystemMessageParam: + type: object + required: [content, role] + properties: + content: + type: string + role: + type: string + enum: ['system'] + name: + type: string + + ChatCompletionUserMessageParam: + type: object + required: [content, role] + properties: + content: + type: string + # TODO: more complex content? + role: + type: string + enum: ['user'] + name: + type: string + + ChatCompletionAssistantMessageParam: + type: object + required: [role] + properties: + content: + type: string + nullable: true + role: + type: string + enum: ['assistant'] + name: + type: string + tool_calls: + type: array + items: + $ref: '#/components/schemas/ToolChoice' + function_call: + type: object + deprecated: true + properties: + arguments: + type: string + name: + type: string + required: [arguments, name] + + ChatCompletionFunctionMessageParam: + type: object + deprecated: true + required: [content, role, name] + properties: + role: + type: string + enum: ['function'] + content: + type: string + name: + type: string + + ChatCompletionToolMessageParam: + type: object + properties: + role: + type: string + enum: ['tool'] + content: + type: string + tool_call_id: + type: string + required: [role, content, tool_call_id] + + # End Message Params + ChatCompletionResponse: type: object properties: type: string enum: - chat.completion + required: [choices, id, created, model, object] ChatCompletionStream: oneOf: ChatCompletionChunk: type: object - required: [id, object, created, token, choices] + required: [id, object, created, choices, model] properties: id: type: string - chat.completion.chunk created: type: integer - token: - $ref: '#/components/schemas/ChatCompletionToken' + system_fingerprint: + type: string + model: + type: string + example: mistralai/Mixtral-8x7B-Instruct-v0.1 choices: title: ChatCompletionChoices type: array usage: allOf: - $ref: '#/components/schemas/UsageData' - nullable: true - finish_reason: - allOf: - - $ref: '#/components/schemas/FinishReason' - - nullable: true StreamSentinel: type: object ChatCompletionChoice: type: object - required: [index, delta] + required: [index, delta, finish_reason] properties: index: type: integer + finish_reason: + $ref: '#/components/schemas/FinishReason' + logprobs: + $ref: '#/components/schemas/LogprobsPart' delta: title: ChatCompletionChoiceDelta type: object - required: [content] + required: [role] properties: + token_id: + type: integer + role: + type: string + enum: ['system', 'user', 'assistant', 'function', 'tool'] content: type: string + nullable: true + tool_calls: + type: array + items: + $ref: '#/components/schemas/ToolChoice' + function_call: + type: object + deprecated: true + nullable: true + properties: + arguments: + type: string + name: + type: string + required:
- arguments + - name EmbeddingsRequest: type: object @@ -1126,23 +1255,18 @@ components: properties: hourly: type: number - format: float example: 0 input: type: number - format: float example: 0.3 output: type: number - format: float example: 0.3 base: type: number - format: float example: 0 finetune: type: number - format: float example: 0 ToolsPart: @@ -1166,16 +1290,25 @@ components: description: 'A map of parameter names to their values.' ToolChoice: type: object + required: [id, type, function, index] properties: + # TODO: is this the right place for index? + index: + type: number + id: + type: string type: type: string - example: 'tool_choice_type' + enum: ['function'] function: type: object + required: [name, arguments] properties: name: type: string example: 'function_name' + arguments: + type: string FileResponse: type: object @@ -1331,11 +1464,12 @@ components: object: type: string enum: - - FinetuneEvent + - 'FinetuneEvent' created_at: type: string level: - $ref: '#/components/schemas/FinetuneEventLevels' + anyOf: + - $ref: '#/components/schemas/FinetuneEventLevels' message: type: string type: @@ -1428,6 +1562,7 @@ components: created_at: type: string level: + type: string enum: - null - info @@ -1439,6 +1574,7 @@ components: message: type: string type: + type: string enum: - job_pending - job_start From 25a188151bc58e38fcdf22881840e86886220169 Mon Sep 17 00:00:00 2001 From: Philip Stern Date: Fri, 21 Jun 2024 18:29:25 -0400 Subject: [PATCH 2/2] Merging changes. --- openapi.yaml | 138 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 93 insertions(+), 45 deletions(-) diff --git a/openapi.yaml b/openapi.yaml index 9d2c1ff..ec05810 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -1,6 +1,6 @@ openapi: 3.1.0 info: - title: Together API + title: Together APIs description: The Together REST API. Please see https://docs.together.ai for more details. version: '2.0.0' termsOfService: https://www.together.ai/terms-of-service @@ -17,7 +17,9 @@ security: paths: /chat/completions: post: - summary: Creates a model response for the given chat conversation. + tags: ['Chat'] + summary: Create chat completion + description: Query a chat model. operationId: chat-completions requestBody: content: @@ -73,7 +75,9 @@ paths: deprecated: false /completions: post: - summary: Creates a completion for the provided prompt and parameters + tags: ['Completion'] + summary: Create completion + description: Query a language, code, or image model. operationId: completions requestBody: content: @@ -129,7 +133,9 @@ paths: deprecated: false /embeddings: post: - summary: Creates an embedding vector representing the input text + tags: ['Embeddings'] + summary: Create embedding + description: Query an embedding model for a given string of text. operationId: embeddings requestBody: content: @@ -182,7 +188,9 @@ paths: deprecated: false /models: get: - summary: Lists all the available models + tags: ['Models'] + summary: List all models + description: Lists all of Together's open-source models operationId: models responses: '200': @@ -230,7 +238,9 @@ paths: deprecated: false /images/generations: post: - summary: Generate images based on a given prompt using a specified model + tags: ['Images'] + summary: Create image + description: Use an image model to generate an image for a given prompt. 
requestBody: required: true content: @@ -280,7 +290,9 @@ paths: $ref: '#/components/schemas/ImageResponse' /files: get: + tags: ['Files'] summary: List all files + description: List the metadata for all uploaded data files. responses: '200': description: List of files @@ -290,7 +302,9 @@ paths: $ref: '#/components/schemas/FileList' /files/{id}: get: - summary: Retrieve a file + tags: ['Files'] + summary: List file + description: List the metadata for a single uploaded data file. parameters: - name: id in: path @@ -305,7 +319,9 @@ paths: schema: $ref: '#/components/schemas/FileResponse' delete: + tags: ['Files'] summary: Delete a file + description: Delete a previously uploaded data file. parameters: - name: id in: path @@ -321,7 +337,9 @@ paths: $ref: '#/components/schemas/FileDeleteResponse' /files/{id}/content: get: - summary: Retrieve file content + tags: ['Files'] + summary: Get file contents + description: Get the contents of a single uploaded data file. parameters: - name: id in: path @@ -337,7 +355,9 @@ paths: $ref: '#/components/schemas/FileObject' /fine-tunes: post: - summary: Create a fine-tuning job + tags: ['Fine-tuning'] + summary: Create job + description: Use a model to create a fine-tuning job. requestBody: required: true content: @@ -385,7 +405,9 @@ paths: schema: $ref: '#/components/schemas/FinetuneResponse' get: - summary: List fine-tune job history + tags: ['Fine-tuning'] + summary: List all jobs + description: List the metadata for all fine-tuning jobs. responses: '200': description: List of fine-tune jobs @@ -395,7 +417,9 @@ paths: $ref: '#/components/schemas/FinetuneList' /fine-tunes/{id}: get: - summary: Retrieve fine-tune job details + tags: ['Fine-tuning'] + summary: List job + description: List the metadata for a single fine-tuning job. parameters: - name: id in: path @@ -411,7 +435,9 @@ paths: $ref: '#/components/schemas/FinetuneResponse' /fine-tunes/{id}/events: get: - summary: List events of a fine-tune job + tags: ['Fine-tuning'] + summary: List job events + description: List the events for a single fine-tuning job. parameters: - name: id in: path @@ -427,7 +453,9 @@ paths: $ref: '#/components/schemas/FinetuneListEvents' /finetune/download: get: - summary: Downloads a compressed fine-tuned model or checkpoint to local disk. + tags: ['Fine-tuning'] + summary: Download model + description: Download a compressed fine-tuned model or checkpoint to local disk. parameters: - in: query name: ft_id @@ -460,7 +488,9 @@ paths: description: Fine-tune ID not found. /fine-tunes/{id}/cancel: post: - summary: Cancels a running fine-tuning job. + tags: ['Fine-tuning'] + summary: Cancel job + description: Cancel a currently running fine-tuning job. parameters: - in: path name: id @@ -479,11 +509,15 @@ paths: description: Invalid request parameters. '404': description: Fine-tune ID not found. + components: securitySchemes: bearerAuth: - type: http - scheme: bearer + type: apiKey + in: header + name: Authorization + x-bearer-format: bearer + x-default: default schemas: ErrorData: @@ -609,18 +643,20 @@ components: description: The maximum number of tokens to generate. stop: type: array - description: A list of string sequences that will truncate (stop) inference text output. + description: A list of string sequences that will truncate (stop) inference text output. For example, "" will stop generation as soon as the model generates the given token. items: type: string temperature: type: number - description: Determines the degree of randomness in the response. 
+ description: A decimal number from 0-1 that determines the degree of randomness in the response. A temperature less than 1 favors more correctness and is appropriate for question answering or summarization. A value closer to 1 introduces more randomness in the output. + format: float top_p: type: number - description: The `top_p` (nucleus) parameter is used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. + description: A percentage (also called the nucleus parameter) that's used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. It specifies a probability threshold below which all less likely tokens are filtered out. This technique helps maintain diversity and generate more fluent and natural-sounding text. + format: float top_k: type: integer - description: The `top_k` parameter is used to limit the number of choices for the next predicted word or token. + description: An integer that's used to limit the number of choices for the next predicted word or token. It specifies the maximum number of tokens to consider at each step, based on their probability of occurrence. This technique helps to speed up the generation process and can improve the quality of the generated text by focusing on the most likely options. format: int32 repetition_penalty: type: number description: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. format: float stream: type: boolean - description: 'If set, tokens are returned as Server-Sent Events as they are made available. Stream terminates with `data: [DONE]`' + description: 'If true, stream tokens as Server-Sent Events as the model generates them instead of waiting for the full model response. The stream terminates with `data: [DONE]`. If false, return a single JSON object containing the results.' logprobs: type: integer minimum: 0 maximum: 1 - description: Determines the number of most likely tokens to return at each token position log probabilities to return + description: Determines the number of most likely tokens to return at each token position, along with their log probabilities. echo: type: boolean - description: If set, the response will contain the prompt, and will also return prompt logprobs if set with logprobs. + description: If true, the response will contain the prompt. Can be used with `logprobs` to return prompt logprobs. n: type: integer - description: Number of generations to return + description: The number of completions to generate for each prompt. minimum: 1 maximum: 128 safety_model: type: string - description: The name of the safety model to use. + description: The name of the moderation model used to validate tokens. Choose from the available moderation models found [here](https://docs.together.ai/docs/inference-models#moderation-models). example: 'safety_model_name' min_p: type: number - description: The `min_p` parameter is a number between 0 and 1 and an alternative to `temperature`. + description: A number between 0 and 1 that can be used as an alternative to temperature. + format: float presence_penalty: type: number - description: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a positive value will increase the likelihood of a model talking about new topics. + description: A number between -2.0 and 2.0 where a positive value increases the likelihood of a model talking about new topics.
+ format: float frequency_penalty: type: number - description: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a positive value will decrease the likelihood of repeating tokens that were mentioned prior. + description: A number between -2.0 and 2.0 where a positive value decreases the likelihood of repeating tokens that have already been mentioned. + format: float logit_bias: type: object additionalProperties: type: number + + format: float description: Adjusts the likelihood of specific tokens appearing in the generated output. example: { '1024': -10.5, '105': 21.4 } CompletionResponse: type: object description: The maximum number of tokens to generate. stop: type: array - description: A list of string sequences that will truncate (stop) inference text output. + description: A list of string sequences that will truncate (stop) inference text output. For example, "" will stop generation as soon as the model generates the given token. items: type: string temperature: type: number - description: Determines the degree of randomness in the response. + description: A decimal number from 0-1 that determines the degree of randomness in the response. A temperature less than 1 favors more correctness and is appropriate for question answering or summarization. A value closer to 1 introduces more randomness in the output. + format: float top_p: type: number - description: The `top_p` (nucleus) parameter is used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. + description: A percentage (also called the nucleus parameter) that's used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. It specifies a probability threshold below which all less likely tokens are filtered out. This technique helps maintain diversity and generate more fluent and natural-sounding text. + format: float top_k: type: integer - description: The `top_k` parameter is used to limit the number of choices for the next predicted word or token. + description: An integer that's used to limit the number of choices for the next predicted word or token. It specifies the maximum number of tokens to consider at each step, based on their probability of occurrence. This technique helps to speed up the generation process and can improve the quality of the generated text by focusing on the most likely options. format: int32 repetition_penalty: type: number description: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. stream: type: boolean - description: 'If set, tokens are returned as Server-Sent Events as they are made available. Stream terminates with `data: [DONE]`' + description: 'If true, stream tokens as Server-Sent Events as the model generates them instead of waiting for the full model response. The stream terminates with `data: [DONE]`. If false, return a single JSON object containing the results.' logprobs: type: integer minimum: 0 maximum: 1 - description: Determines the number of most likely tokens to return at each token position log probabilities to return + description: Determines the number of most likely tokens to return at each token position, along with their log probabilities.
echo: type: boolean - description: If set, the response will contain the prompt, and will also return prompt logprobs if set with logprobs. + description: If true, the response will contain the prompt. Can be used with `logprobs` to return prompt logprobs. n: type: integer - description: Number of generations to return + description: The number of completions to generate for each prompt. minimum: 1 maximum: 128 min_p: type: number - description: The `min_p` parameter is a number between 0 and 1 and an alternative to `temperature`. + + description: A number between 0 and 1 that can be used as an alternative to temperature. + format: float presence_penalty: type: number - description: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a positive value will increase the likelihood of a model talking about new topics. + description: A number between -2.0 and 2.0 where a positive value increases the likelihood of a model talking about new topics. + format: float frequency_penalty: type: number - description: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a positive value will decrease the likelihood of repeating tokens that were mentioned prior. + description: A number between -2.0 and 2.0 where a positive value decreases the likelihood of repeating tokens that have already been mentioned. + format: float logit_bias: type: object additionalProperties: type: number - description: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens appearing in the generated output. + format: float + description: Adjusts the likelihood of specific tokens appearing in the generated output. example: { '1024': -10.5, '105': 21.4 } function_call: oneOf: @@ -896,7 +944,7 @@ components: response_format: type: object - description: Specifies the format of the response. + description: An object specifying the format that the model must output. properties: type: type: string @@ -909,19 +957,19 @@ components: description: The schema of the response format. tools: type: array - description: A list of tools to be used in the query. + description: A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for. items: $ref: '#/components/schemas/ToolsPart' tool_choice: type: object - description: The choice of tool to use. + description: Controls which (if any) function is called by the model. By default uses `auto`, which lets the model pick between generating a message or calling a function. oneOf: - type: string example: 'tool_name' - $ref: '#/components/schemas/ToolChoice' safety_model: type: string - description: The name of the safety model to use. + description: The name of the moderation model used to validate tokens. Choose from the available moderation models found [here](https://docs.together.ai/docs/inference-models#moderation-models). example: 'safety_model_name' ChatCompletionMessageParam:
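
Usage note (illustrative only, not part of the patch): the sketch below exercises the chat completion types added in this series. The base URL https://api.together.xyz/v1/chat/completions, the TOGETHER_API_KEY environment variable, the model name, and the get_weather function are assumptions for illustration; the model and messages fields follow the standard chat-completions request shape, while tools, tool_choice-style tool calls, stream, and the response handling mirror the ChatCompletionRequest, ChatCompletionMessage, and ToolChoice schemas defined above.

# chat_completion_example.py -- hypothetical sketch, not part of the patch
import os

import requests

API_URL = "https://api.together.xyz/v1/chat/completions"  # assumed endpoint
HEADERS = {"Authorization": f"Bearer {os.environ['TOGETHER_API_KEY']}"}  # bearerAuth scheme

payload = {
    "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",  # example model name from the spec
    "messages": [
        # ChatCompletionSystemMessageParam / ChatCompletionUserMessageParam shapes
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the weather in Boston?"},
    ],
    "tools": [
        {
            # ToolsPart shape: type plus function{name, description, parameters}
            "type": "function",
            "function": {
                "name": "get_weather",  # hypothetical function for illustration
                "description": "Look up the current weather for a city.",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ],
    "max_tokens": 256,
    "temperature": 0.7,
    "stream": False,  # set True for Server-Sent Events terminated by "data: [DONE]"
}

resp = requests.post(API_URL, headers=HEADERS, json=payload, timeout=60)
resp.raise_for_status()
choice = resp.json()["choices"][0]

# ChatCompletionMessage: role is "assistant"; content may be null when tool_calls are present.
message = choice["message"]
for call in message.get("tool_calls") or []:
    # ToolChoice shape: index, id, type="function", function{name, arguments}
    print(call["function"]["name"], call["function"]["arguments"])
if message.get("content"):
    print(message["content"])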