Using TensorRT-LLM and StreamingLLM for Efficient Inference on Mistral¶
Welcome!
In this notebook, we will walk through using TensorRT-LLM with the StreamingLLM feature to run inference on Mistral. TensorRT-LLM provides users with an easy-to-use Python API to define Large Language Models (LLMs) and build TensorRT engines that contain state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs. StreamingLLM is a novel framework developed at the MIT HAN Lab and is supported in TensorRT-LLM. See the GitHub repo for more examples and documentation!
Introduction to StreamingLLM¶
Handling infinite-length text with LLMs presents challenges. Notably, storing all previous Key and Value (KV) states demands significant memory, and models might struggle to generate text beyond their training sequence length. StreamingLLM addresses this by retaining only the most recent tokens and attention sinks, discarding intermediate tokens. This enables the model to generate coherent text from recent tokens without a cache reset — a capability not seen in earlier methods.
StreamingLLM is optimized for streaming applications, such as multi-round dialogues. It's ideal for scenarios where a model needs to operate continually without requiring extensive memory or dependency on past data. An example is a daily assistant based on LLMs. StreamingLLM would let the model function continuously, basing its responses on recent conversations without needing to refresh its cache. Earlier methods would either need a cache reset when the conversation length exceeded the training length (losing recent context) or recompute KV states from recent text history, which can be time-consuming.
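To make the cache policy concrete, here is a small Python sketch (illustrative only, not part of TensorRT-LLM; the function name and parameters are made up for this example) showing which token positions a StreamingLLM-style cache keeps: a handful of attention-sink tokens at the very start of the sequence plus a sliding window of the most recent tokens.
# Illustrative only: which KV-cache positions a StreamingLLM-style policy retains.
# "sink_len" attention-sink tokens are always kept, plus the most recent tokens,
# up to a total cache size of "window_len".
def retained_positions(seq_len, window_len, sink_len):
    if seq_len <= window_len:
        return list(range(seq_len))
    sinks = list(range(sink_len))
    recent = list(range(seq_len - (window_len - sink_len), seq_len))
    return sinks + recent

# A 20-token sequence with a cache of 8 and 4 sink tokens keeps
# positions [0, 1, 2, 3] plus the last 4 positions [16, 17, 18, 19].
print(retained_positions(20, 8, 4))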
Credits¶
Professor Song Han is an NVIDIA Distinguished Engineer and an associate professor in the MIT EECS department. He has been credited with numerous advances in the field of deep learning and has founded multiple AI companies.
Deployment powered by Brev.dev 🤙
!nvidia-smi
Install TensorRT-LLM¶
!pip install -q ipywidgets
!pip install tensorrt_llm -U -q --extra-index-url https://pypi.nvidia.com
# Fetch the latest Llama/Mistral conversion code from the TensorRT-LLM repo and
# overwrite the copy shipped with the installed package
!wget https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/tensorrt_llm/models/llama/convert.py
!mv convert.py /usr/local/lib/python3.10/dist-packages/tensorrt_llm/models/llama/

# Fetch the example checkpoint-conversion and inference scripts used below
!wget https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/llama/convert_checkpoint.py -P .
!wget https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/run.py -P .
!wget https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/utils.py -P .
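To confirm the installation worked, you can print the installed version (assuming the package exposes __version__, which recent TensorRT-LLM releases do):
!python3 -c "import tensorrt_llm; print(tensorrt_llm.__version__)"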
Convert Mistral to the TensorRT-LLM checkpoint format¶
For StreamingLLM to be enabled, we pass two additional flags to the checkpoint conversion:
dense_context_fmha
- uses dense context FMHA in the context phase
enable_pos_shift
- lets us use positions in the KV cache for RoPE
# Convert the model to a TensorRT-LLM checkpoint with the StreamingLLM feature, using a single GPU and FP16.
!python convert_checkpoint.py --model_dir mistralai/Mistral-7B-v0.1 \
--output_dir ./tllm_checkpoint_1gpu_streamingllm \
--dtype float16 \
--dense_context_fmha \
--enable_pos_shift
# Convert the model again without the StreamingLLM feature, as a baseline, using a single GPU and FP16.
!python convert_checkpoint.py --model_dir mistralai/Mistral-7B-v0.1 \
--output_dir ./tllm_checkpoint_1gpu_nostream \
--dtype float16
Build the TensorRT engine for the model¶
# Build the engine from the StreamingLLM-enabled checkpoint
!trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_streamingllm \
--output_dir ./mistralengine_streaming \
--gemm_plugin float16
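If you also want a baseline engine to compare against, the non-streaming checkpoint converted above can be built with the same command. It is not used later in this notebook, and the output directory name below is just a suggestion.
# Baseline, no StreamingLLM (optional)
!trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_nostream \
              --output_dir ./mistralengine_nostream \
              --gemm_plugin float16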
Run inference with a large input sequence¶
We demonstrate with the open-source Tiny Shakespeare dataset, using approximately 125,000 characters as our input.
import requests
import re

# Download the Tiny Shakespeare text and collapse all whitespace into single spaces
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
response = requests.get(url)
if response.status_code == 200:
    story = response.text
    story = re.sub(r'\s+', ' ', story).strip()
else:
    story = None
    print("Failed to retrieve the document.")
%%time
# Use the streaming engine with a sliding window/cache size of 4096 and a sink token length of 4
!python3 ./run.py --max_output_len=150 \
--tokenizer_dir mistralai/Mistral-7B-v0.1 \
--engine_dir=./mistralengine_streaming \
--max_attention_window_size=4096 \
--sink_token_length=4 \
--input_text "{story[983152:]}"