Gemfury

team-dmm / auto_gptq python

Repository URL to install this package:
Details
auto_gptq / test_awq_compatibility_generation.py
# ruff: noqa: I001
import unittest

import torch
import autogptq_cuda_64
import autogptq_cuda_256
from transformers import AutoTokenizer

from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear as CudaOldQLinear


# AutoAWQ is an optional dependency.  When it cannot be imported, the tests in
# this module skip themselves and report the import error kept in AWQ_EXCEPTION.
try:
    from awq import AutoAWQForCausalLM
except ModuleNotFoundError as e:
    AutoAWQForCausalLM = None
    AWQ_EXCEPTION = e
else:
    # Bind the name on the success path too, so it always exists at module level.
    AWQ_EXCEPTION = None


class TestAwqCompatibility(unittest.TestCase):
    # TODO: test cuda-old fp16.
    # TODO: test cuda-old fp32.
    # TODO: test exllama v2.

    def test_generation_cuda_old_fp32_pytorch(self):
        if AutoAWQForCausalLM is None:
            self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")

        device = torch.device("cuda:0")
        quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"

        model_autogptq = AutoGPTQForCausalLM.from_quantized(
            quant_path,
            device=device,
            use_triton=False,
            inject_fused_attention=False,
            inject_fused_mlp=False,
            disable_exllama=True,
            disable_exllamav2=True,
            torch_dtype=torch.float32,
        )
        tokenizer = AutoTokenizer.from_pretrained(quant_path)

        prompt = "I am in Paris and I am going to see the"

        inp = tokenizer(prompt, return_tensors="pt").to(device)

        for name, submodule in model_autogptq.named_modules():
            if isinstance(submodule, CudaOldQLinear):
                # Just a hack to test the handmade pytorch implementation path.
                submodule.autogptq_cuda_available = False

        autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
        autogptq_output = tokenizer.decode(autogptq_output[0])

        model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)

        awq_output = model_awq.generate(
            **inp,
            num_beams=1,
            min_new_tokens=30,
            max_new_tokens=30,
        )

        awq_output = tokenizer.decode(awq_output[0])

        self.assertTrue(awq_output == autogptq_output)

    def test_generation_cuda_old_cuda_256(self):
        if AutoAWQForCausalLM is None:
            self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")

        device = torch.device("cuda:0")
        quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"

        tokenizer = AutoTokenizer.from_pretrained(quant_path)
        prompt = "I am in Paris and I am going to see the"

        for torch_dtype in [torch.float16, torch.float32]:
            model_autogptq = AutoGPTQForCausalLM.from_quantized(
                quant_path,
                device=device,
                use_triton=False,
                inject_fused_attention=False,
                inject_fused_mlp=False,
                disable_exllama=True,
                disable_exllamav2=True,
                torch_dtype=torch_dtype,
            )

            for name, module in model_autogptq.named_modules():
                if isinstance(module, CudaOldQLinear):
                    self.assertTrue(module.autogptq_cuda == autogptq_cuda_256)

                    if torch_dtype == torch.float32:
                        self.assertFalse(module.use_cuda_fp16)
                    else:
                        self.assertTrue(module.use_cuda_fp16)

            inp = tokenizer(prompt, return_tensors="pt").to(device)

            autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
            autogptq_output = tokenizer.decode(autogptq_output[0])

            model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)

            awq_output = model_awq.generate(
                **inp,
                num_beams=1,
                min_new_tokens=30,
                max_new_tokens=30,
            )

            awq_output = tokenizer.decode(awq_output[0])

            self.assertTrue(awq_output == autogptq_output)

    def test_generation_cuda_old_cuda_64(self):
        if AutoAWQForCausalLM is None:
            self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")

        device = torch.device("cuda:0")
        quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"

        tokenizer = AutoTokenizer.from_pretrained(quant_path)
        prompt = "I am in Paris and I am going to see the"

        for torch_dtype in [torch.float16, torch.float32]:
            model_autogptq = AutoGPTQForCausalLM.from_quantized(
                quant_path,
                device=device,
                use_triton=False,
                inject_fused_attention=False,
                inject_fused_mlp=False,
                disable_exllama=True,
                disable_exllamav2=True,
                torch_dtype=torch_dtype,
            )

            # Force autogptq_cuda_64.
            for name, module in model_autogptq.named_modules():
                if isinstance(module, CudaOldQLinear):
                    if module.autogptq_cuda != autogptq_cuda_64:
                        module.autogptq_cuda = autogptq_cuda_64

                    if torch_dtype == torch.float32:
                        self.assertFalse(module.use_cuda_fp16)
                    else:
                        self.assertTrue(module.use_cuda_fp16)

            inp = tokenizer(prompt, return_tensors="pt").to(device)

            autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
            autogptq_output = tokenizer.decode(autogptq_output[0])

            model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)

            awq_output = model_awq.generate(
                **inp,
                num_beams=1,
                min_new_tokens=30,
                max_new_tokens=30,
            )

            awq_output = tokenizer.decode(awq_output[0])

            self.assertTrue(awq_output == autogptq_output)

    def test_generation_exllama(self):
        if AutoAWQForCausalLM is None:
            self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")

        device = torch.device("cuda:0")
        quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"

        model_autogptq = AutoGPTQForCausalLM.from_quantized(
            quant_path,
            device=device,
            use_triton=False,
            inject_fused_attention=False,
            inject_fused_mlp=False,
            disable_exllama=False,
            disable_exllamav2=True,
            torch_dtype=torch.float16,
        )
        tokenizer = AutoTokenizer.from_pretrained(quant_path)

        prompt = "I am in Paris and I am going to see the"

        inp = tokenizer(prompt, return_tensors="pt").to(device)

        for name, submodule in model_autogptq.named_modules():
            if isinstance(submodule, CudaOldQLinear):
                # Just a hack to test the handmade pytorch implementation path.
                submodule.autogptq_cuda_available = False

        autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
        autogptq_output = tokenizer.decode(autogptq_output[0])

        model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)

        awq_output = model_awq.generate(
            **inp,
            num_beams=1,
            min_new_tokens=30,
            max_new_tokens=30,
        )

        awq_output = tokenizer.decode(awq_output[0])

        self.assertTrue(awq_output == autogptq_output)

    def test_generation_exllamav2(self):
        if AutoAWQForCausalLM is None:
            self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")

        device = torch.device("cuda:0")
        quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"

        model_autogptq = AutoGPTQForCausalLM.from_quantized(
            quant_path,
            device=device,
            use_triton=False,
            inject_fused_attention=False,
            inject_fused_mlp=False,
            torch_dtype=torch.float16,
        )
        tokenizer = AutoTokenizer.from_pretrained(quant_path)

        prompt = "I am in Paris and I am going to see the"

        inp = tokenizer(prompt, return_tensors="pt").to(device)

        for name, submodule in model_autogptq.named_modules():
            if isinstance(submodule, CudaOldQLinear):
                # Just a hack to test the handmade pytorch implementation path.
                submodule.autogptq_cuda_available = False

        autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
        autogptq_output = tokenizer.decode(autogptq_output[0])

        model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)

        awq_output = model_awq.generate(
            **inp,
            num_beams=1,
            min_new_tokens=30,
            max_new_tokens=30,
        )

        awq_output = tokenizer.decode(awq_output[0])

        self.assertTrue(awq_output == autogptq_output)
team-dmm / auto_gptq python

Products

About

Resources

Contact Gemfury