@inproceedings{tao2026omnizip,
  author    = {Tao, Keda and Shao, Kele and Yu, Bohan and Wang, Weiqiang and Liu, Jian and Wang, Huan},
  title     = {{OmniZip}: Audio-Guided Dynamic Token Compression for Fast Omnimodal Large Language Models},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2026},
}
CVPR
StreamingTOM: Streaming Token Compression for Efficient Video Understanding
@inproceedings{chen2026streamingtom,
  author    = {Chen, Xueyi and Tao, Keda and Shao, Kele and Wang, Huan},
  title     = {{StreamingTOM}: Streaming Token Compression for Efficient Video Understanding},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2026},
}
CVPR
Can MLLMs Guide Me Home? A Benchmark Study on Fine-Grained Visual Reasoning from Transit Maps
Sicheng Feng, Song Wang, Shuyi Ouyang, Lingdong Kong, Zikai Song, Jianke Zhu, Huan Wang, and Xinchao Wang
In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2026
@inproceedings{feng2026reasonmap,
  author    = {Feng, Sicheng and Wang, Song and Ouyang, Shuyi and Kong, Lingdong and Song, Zikai and Zhu, Jianke and Wang, Huan and Wang, Xinchao},
  title     = {Can {MLLMs} Guide Me Home? A Benchmark Study on Fine-Grained Visual Reasoning from Transit Maps},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2026},
}
ICLR
MergeMix: A Unified Augmentation Paradigm for Visual and Multi-Modal Understanding
@inproceedings{jin2026mergemix,
  author    = {Jin, Xin and Li, Siyuan and Jian, Siyong and Yu, Kai and Wang, Huan},
  title     = {{MergeMix}: A Unified Augmentation Paradigm for Visual and Multi-Modal Understanding},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2026},
}
ICLR
OBS-Diff: Accurate Pruning For Diffusion Models in One-Shot
@inproceedings{zhu2026obsdiff,
  author    = {Zhu, Junhan and Wang, Hesong and Su, Mingluo and Wang, Zefang and Wang, Huan},
  title     = {{OBS-Diff}: Accurate Pruning For Diffusion Models in One-Shot},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2026},
}
ICLR
RewardMap: Tackling Sparse Rewards in Fine-grained Visual Reasoning via Multi-Stage Reinforcement Learning
@inproceedings{feng2026rewardmap,
  author    = {Feng, Sicheng and Tuo, Kaiwen and Wang, Song and Kong, Lingdong and Zhu, Jianke and Wang, Huan},
  title     = {{RewardMap}: Tackling Sparse Rewards in Fine-grained Visual Reasoning via Multi-Stage Reinforcement Learning},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2026},
}
ICLR
Autoregressive Image Generation with Randomized Parallel Decoding
@inproceedings{li2026arpg,
  author    = {Li, Haopeng and Yang, Jinyue and Li, Guoqi and Wang, Huan},
  title     = {Autoregressive Image Generation with Randomized Parallel Decoding},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2026},
}
CPAL Oral
ROSE: Reordered SparseGPT for More Accurate One-Shot Large Language Models Pruning
@inproceedings{su2026rose,
  author    = {Su, Mingluo and Wang, Huan},
  title     = {{ROSE}: Reordered {SparseGPT} for More Accurate One-Shot Large Language Models Pruning},
  booktitle = {Conference on Parsimony and Learning (CPAL)},
  year      = {2026},
}
CPAL
ResSVD: Residual Compensated SVD for Large Language Model Compression
@inproceedings{bai2026ressvd,
  author    = {Bai, Haolei and Jian, Siyong and Liang, Tuo and Yin, Yu and Wang, Huan},
  title     = {{ResSVD}: Residual Compensated {SVD} for Large Language Model Compression},
  booktitle = {Conference on Parsimony and Learning (CPAL)},
  year      = {2026},
}
TMLR
When Tokens Talk Too Much: A Survey of Multimodal Long-Context Token Compression across Images, Videos, and Audios
@article{shao2026tokens,
  author  = {Shao, Kele and Tao, Keda and Zhang, Kejia and Feng, Sicheng and Cai, Mu and Shang, Yuzhang and You, Haoxuan and Qin, Can and Sui, Yang and Wang, Huan},
  title   = {When Tokens Talk Too Much: A Survey of Multimodal Long-Context Token Compression across Images, Videos, and Audios},
  journal = {Transactions on Machine Learning Research (TMLR)},
  year    = {2026},
}
arXiv
LVOmniBench: Pioneering Long Audio-Video Understanding Evaluation for Omnimodal LLMs
@article{tao2026lvomnibench,
  author        = {Tao, Keda and Zheng, Yuhua and Xu, Jia and Du, Wenjie and Shao, Kele and Wang, Hesong and Chen, Xueyi and Jin, Xin and Zhu, Junhan and Yu, Bohan and Wang, Weiqiang and Liu, Jian and Qin, Can and Zhang, Yulun and Yang, Ming-Hsuan and Wang, Huan},
  title         = {{LVOmniBench}: Pioneering Long Audio-Video Understanding Evaluation for Omnimodal {LLMs}},
  journal       = {arXiv preprint arXiv:2603.19217},
  year          = {2026},
  eprint        = {2603.19217},
  archiveprefix = {arXiv},
}
arXiv
MobileKernelBench: Can LLMs Write Efficient Kernels for Mobile Devices?
@article{zou2026mobilekernelbench,
  author        = {Zou, Xingze and Wang, Jing and Zheng, Yuhua and Chen, Xueyi and Bai, Haolei and Kong, Lingcheng and Abu-Bakar, Syed A. R. and Wang, Zhaode and Lv, Chengfei and Hu, Haoji and Wang, Huan},
  title         = {{MobileKernelBench}: Can {LLMs} Write Efficient Kernels for Mobile Devices?},
  journal       = {arXiv preprint arXiv:2603.11935},
  year          = {2026},
  eprint        = {2603.11935},
  archiveprefix = {arXiv},
}
arXiv
DICE: Diffusion Large Language Models Excel at Generating CUDA Kernels
Haolei Bai, Lingcheng Kong, Xueyi Chen, Jianmian Wang, Zhiqiang Tao, and Huan Wang
@article{bai2026dice,
  author        = {Bai, Haolei and Kong, Lingcheng and Chen, Xueyi and Wang, Jianmian and Tao, Zhiqiang and Wang, Huan},
  title         = {{DICE}: Diffusion Large Language Models Excel at Generating {CUDA} Kernels},
  journal       = {arXiv preprint arXiv:2602.11715},
  year          = {2026},
  eprint        = {2602.11715},
  archiveprefix = {arXiv},
}
2025
NeurIPS
HoliTom: Holistic Token Merging for Fast Video Large Language Models
@inproceedings{shao2025holitom,
  author    = {Shao, Kele and Tao, Keda and Qin, Can and You, Haoxuan and Sui, Yang and Wang, Huan},
  title     = {{HoliTom}: Holistic Token Merging for Fast Video Large Language Models},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
  year      = {2025},
}
NeurIPS
Poison as Cure: Visual Noise for Mitigating Object Hallucinations in LVMs
Kejia Zhang, Keda Tao, Jiasheng Tang, and Huan Wang
In Advances in Neural Information Processing Systems (NeurIPS), 2025
@inproceedings{zhang2025poisoncure,
  author    = {Zhang, Kejia and Tao, Keda and Tang, Jiasheng and Wang, Huan},
  title     = {Poison as Cure: Visual Noise for Mitigating Object Hallucinations in {LVMs}},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
  year      = {2025},
}
NeurIPS
FreqExit: Enabling Early-Exit Inference for Visual Autoregressive Models via Frequency-Aware Guidance
@inproceedings{li2025freqexit,
  author    = {Li, Ying and Lv, Chengfei and Wang, Huan},
  title     = {{FreqExit}: Enabling Early-Exit Inference for Visual Autoregressive Models via Frequency-Aware Guidance},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
  year      = {2025},
}
CVPR
DyCoke: Dynamic Compression of Tokens for Fast Video Large Language Models
@inproceedings{tao2025dycoke,
  author    = {Tao, Keda and Qin, Can and You, Haoxuan and Sui, Yang and Wang, Huan},
  title     = {{DyCoke}: Dynamic Compression of Tokens for Fast Video Large Language Models},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2025},
}
ICCV
On-Device Diffusion Transformer Policy for Efficient Robot Manipulation
@inproceedings{wu2025ondevice,
  author    = {Wu, Yiming and Wang, Huan and Chen, Zhenghao and Pang, Jianxin and Xu, Dong},
  title     = {On-Device Diffusion Transformer Policy for Efficient Robot Manipulation},
  booktitle = {IEEE/CVF International Conference on Computer Vision (ICCV)},
  year      = {2025},
}
TCSVT
Niagara: Normal-Integrated Geometric Affine Field for Scene Reconstruction from a Single View
@article{wu2025niagara,
  author  = {Wu, Xianzu and Ai, Zhenxin and Yang, Harry and Lim, Ser-Nam and Liu, Jun and Wang, Huan},
  title   = {Niagara: Normal-Integrated Geometric Affine Field for Scene Reconstruction from a Single View},
  journal = {IEEE Transactions on Circuits and Systems for Video Technology (TCSVT)},
  year    = {2025},
}
arXiv
Active Perception Agent for Omnimodal Audio-Video Understanding
Keda Tao, Wenjie Du, Bohan Yu, Weiqiang Wang, Jian Liu, and Huan Wang
@article{tao2025omniagent,
  author        = {Tao, Keda and Du, Wenjie and Yu, Bohan and Wang, Weiqiang and Liu, Jian and Wang, Huan},
  title         = {Active Perception Agent for Omnimodal Audio-Video Understanding},
  journal       = {arXiv preprint arXiv:2512.23646},
  year          = {2025},
  eprint        = {2512.23646},
  archiveprefix = {arXiv},
}
arXiv
Which Heads Matter for Reasoning? RL-Guided KV Cache Compression
@article{du2025rlkv,
  author        = {Du, Wenjie and Jiang, Li and Tao, Keda and Liu, Xue and Wang, Huan},
  title         = {Which Heads Matter for Reasoning? {RL}-Guided {KV} Cache Compression},
  journal       = {arXiv preprint arXiv:2510.08525},
  year          = {2025},
  eprint        = {2510.08525},
  archiveprefix = {arXiv},
}
arXiv
ConCuR: Conciseness Makes State-of-the-Art Kernel Generation
Lingcheng Kong, Jiateng Wei, Hanzhang Shen, and Huan Wang
@article{kong2025concur,
  author        = {Kong, Lingcheng and Wei, Jiateng and Shen, Hanzhang and Wang, Huan},
  title         = {{ConCuR}: Conciseness Makes State-of-the-Art Kernel Generation},
  journal       = {arXiv preprint arXiv:2510.07356},
  year          = {2025},
  eprint        = {2510.07356},
  archiveprefix = {arXiv},
}
arXiv
SparseSSM: Efficient Selective Structured State Space Models Can Be Pruned in One-Shot
@article{tuo2025sparsessm,
  author        = {Tuo, Kaiwen and Wang, Huan},
  title         = {{SparseSSM}: Efficient Selective Structured State Space Models Can Be Pruned in One-Shot},
  journal       = {arXiv preprint arXiv:2506.09613},
  year          = {2025},
  eprint        = {2506.09613},
  archiveprefix = {arXiv},
}
arXiv
Plug-and-Play 1.x-Bit KV Cache Quantization for Video Large Language Models
@article{tao2025plugandplay,
  author        = {Tao, Keda and You, Haoxuan and Sui, Yang and Qin, Can and Wang, Huan},
  title         = {Plug-and-Play 1.x-Bit {KV} Cache Quantization for Video Large Language Models},
  journal       = {arXiv preprint arXiv:2503.16257},
  year          = {2025},
  eprint        = {2503.16257},
  archiveprefix = {arXiv},
}
@article{feng2024oracle,
  author        = {Feng, Sicheng and Tao, Keda and Wang, Huan},
  title         = {Is Oracle Pruning the True Oracle?},
  journal       = {arXiv preprint arXiv:2412.00143},
  year          = {2024},
  eprint        = {2412.00143},
  archiveprefix = {arXiv},
}