publications
2025
- [COLM] An illusion of progress? Assessing the current state of web agents. Tianci Xue, Weijian Qi, Tianneng Shi, and 5 more authors. 2025.
As digitalization and cloud technologies evolve, the web is becoming increasingly important in modern society. Autonomous web agents based on large language models (LLMs) hold great potential for work automation. It is therefore important to accurately measure and monitor the progression of their capabilities. In this work, we conduct a comprehensive and rigorous assessment of the current state of web agents. Our results depict a very different picture of the competency of current agents, suggesting over-optimism in previously reported results. This gap can be attributed to shortcomings in existing benchmarks. We introduce Online-Mind2Web, an online evaluation benchmark consisting of 300 diverse and realistic tasks spanning 136 websites. It enables us to evaluate web agents under a setting that approximates how real users use these agents. To facilitate more scalable evaluation and development, we also develop a novel LLM-as-a-Judge automatic evaluation method and show that it can achieve around 85% agreement with human judgment, substantially higher than existing methods. Finally, we present the first comprehensive comparative analysis of current web agents, highlighting both their strengths and limitations to inspire future research. (An illustrative sketch of an LLM-as-a-Judge evaluation loop appears after this year's entries.)
@inproceedings{xue2025illusion, title = {An illusion of progress? assessing the current state of web agents}, author = {Xue, Tianci and Qi, Weijian and Shi, Tianneng and Song, Chan Hee and Gou, Boyu and Song, Dawn and Sun, Huan and Su, Yu}, booktitle = {First Conference on Language Modeling}, year = {2025}, }
- [arXiv] Mind2Web 2: Evaluating Agentic Search with Agent-as-a-Judge. Boyu Gou, Zanming Huang, Yuting Ning, and 8 more authors. arXiv preprint arXiv:2506.21506, 2025.
@article{gou2025mind2web, title = {Mind2Web 2: Evaluating Agentic Search with Agent-as-a-Judge}, author = {Gou, Boyu and Huang, Zanming and Ning, Yuting and Gu, Yu and Lin, Michael and Qi, Weijian and Kopanev, Andrei and Yu, Botao and Guti{\'e}rrez, Bernal Jim{\'e}nez and Shu, Yiheng and others}, journal = {arXiv preprint arXiv:2506.21506}, year = {2025}, }
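The LLM-as-a-Judge evaluation mentioned in the Online-Mind2Web abstract above can be pictured as a simple judging loop over agent trajectories. The sketch below is a minimal, hypothetical illustration rather than the paper's actual implementation: the Trajectory fields, the prompt wording, and the user-supplied call_llm client are all assumptions.

```python
# Minimal sketch of an LLM-as-a-Judge evaluation loop for web agents.
# All names (Trajectory, judge_prompt, call_llm) are hypothetical, not from the paper's code.
from dataclasses import dataclass
from typing import Callable, List

@dataclass
class Trajectory:
    task: str             # natural-language task the agent was asked to do
    actions: List[str]    # textual summary of the agent's actions
    final_state: str      # textual description of the final page or answer

def judge_prompt(traj: Trajectory) -> str:
    # Ask the judge model for a binary verdict on task completion.
    return (
        "You are evaluating a web agent.\n"
        f"Task: {traj.task}\n"
        "Actions taken:\n- " + "\n- ".join(traj.actions) + "\n"
        f"Final state: {traj.final_state}\n"
        "Did the agent complete the task? Answer SUCCESS or FAILURE."
    )

def judge(traj: Trajectory, call_llm: Callable[[str], str]) -> bool:
    # call_llm is any text-in/text-out model client supplied by the caller.
    return "SUCCESS" in call_llm(judge_prompt(traj)).upper()

def agreement(trajs: List[Trajectory], human_labels: List[bool],
              call_llm: Callable[[str], str]) -> float:
    # Fraction of trajectories where the automatic judge matches the human label,
    # analogous to the human-agreement rate reported in the abstract.
    verdicts = [judge(t, call_llm) for t in trajs]
    return sum(v == h for v, h in zip(verdicts, human_labels)) / len(trajs)
```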
2024
- [ACL] PACIT: Unlocking the Power of Examples for Better In-Context Instruction Tuning. Tianci Xue, Ziqi Wang, Yixia Li, and 2 more authors. In Findings of the Association for Computational Linguistics: ACL 2024, Aug 2024.
Instruction tuning enhances the instruction-following ability of large language models by finetuning with supervised instruction data. Previous work proposes in-context instruction tuning (ICIT), where specific positive or negative examples are incorporated into the prompt for better performance. In this work, we propose PACIT, a simple and effective in-context instruction tuning method inspired by the pedagogical concept of desirable difficulty. PACIT unlocks the power of examples by encouraging the model to actively learn to grasp the distinctions between positive and negative examples instead of merely reading them. The model is expected to first verify the correctness of the provided example according to the task description, and this judgment then serves as the condition for generating a better response to the task instance. Our extensive experiments prove the effectiveness of PACIT, which outperforms the ICIT baseline on both in-domain and out-of-domain tasks by up to 9.16 and 3.14 average ROUGE-L points, respectively. Moreover, PACIT can notably enhance the performance of instruction tuning even when all positive and negative examples are generated with a self-instruct method. (An illustrative sketch of how such a verify-then-respond training instance might be assembled appears after this year's entries.)
@inproceedings{xue-etal-2024-pacit, title = {{PACIT}: Unlocking the Power of Examples for Better In-Context Instruction Tuning}, author = {Xue, Tianci and Wang, Ziqi and Li, Yixia and Chen, Yun and Chen, Guanhua}, editor = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek}, booktitle = {Findings of the Association for Computational Linguistics: ACL 2024}, month = aug, year = {2024}, address = {Bangkok, Thailand}, publisher = {Association for Computational Linguistics}, url = {https://aclanthology.org/2024.findings-acl.36/}, doi = {10.18653/v1/2024.findings-acl.36}, pages = {654--665}, }
- [arXiv] Is your LLM secretly a world model of the internet? Model-based planning for web agents. Yu Gu, Kai Zhang, Yuting Ning, and 8 more authors. arXiv preprint arXiv:2411.06559, 2024.
@article{gu2024your, title = {Is your llm secretly a world model of the internet? model-based planning for web agents}, author = {Gu, Yu and Zhang, Kai and Ning, Yuting and Zheng, Boyuan and Gou, Boyu and Xue, Tianci and Chang, Cheng and Srivastava, Sanjari and Xie, Yanan and Qi, Peng and others}, journal = {arXiv preprint arXiv:2411.06559}, year = {2024}, }
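PACIT's verify-then-respond idea, as described in the abstract above, can be made concrete as a small data-construction helper: the prompt carries the task description plus one in-context example, and the target asks the model to judge the example before answering. This is a minimal sketch under stated assumptions; the field names, prompt wording, and target format are illustrative, not the paper's exact training format.

```python
# Minimal sketch of assembling a PACIT-style training instance (illustrative only).
from dataclasses import dataclass

@dataclass
class Example:
    instance: str       # an input for the task
    response: str       # a candidate response to that input
    is_positive: bool   # whether the response is a correct (positive) example

def build_pacit_style_instance(task_description: str, example: Example,
                               query: str, gold_response: str) -> dict:
    # Prompt: task description plus one in-context example, followed by the new input.
    prompt = (
        f"Task: {task_description}\n"
        f"Example input: {example.instance}\n"
        f"Example response: {example.response}\n"
        "First decide whether the example response is correct for the task, "
        "then answer the new input.\n"
        f"New input: {query}"
    )
    # Target: the correctness verdict acts as the condition for the final answer.
    verdict = "correct" if example.is_positive else "incorrect"
    target = f"The example response is {verdict}. Answer: {gold_response}"
    return {"prompt": prompt, "target": target}
```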
2023
- [arXiv] RCoT: Detecting and rectifying factual inconsistency in reasoning by reversing chain-of-thought. Tianci Xue, Ziqi Wang, Zhenhailong Wang, and 3 more authors. arXiv preprint arXiv:2305.11499, 2023.
Large language models (LLMs) have achieved promising performance on arithmetic reasoning tasks by incorporating step-by-step chain-of-thought (CoT) prompting. However, LLMs face challenges in maintaining factual consistency during reasoning, exhibiting tendencies toward condition overlooking, question misinterpretation, and condition hallucination on given problems. Existing methods use coarse-grained feedback (e.g., whether the answer is correct) to improve factual consistency. In this work, we propose RCoT (Reversing Chain-of-Thought), a novel method to improve LLMs' reasoning abilities by automatically detecting and rectifying factual inconsistency in LLMs' generated solutions. To detect factual inconsistency, RCoT first asks LLMs to reconstruct the problem based on generated solutions. Then fine-grained comparisons between the original problem and the reconstructed problem expose the factual inconsistency in the original solutions. To rectify the solution, RCoT formulates detected factual inconsistency into fine-grained feedback to guide LLMs in revising solutions. Experimental results demonstrate improvements of RCoT over standard CoT, Self-Consistency, and Self-Refine across seven arithmetic datasets. Moreover, we find that manually written fine-grained feedback can dramatically improve LLMs' reasoning abilities (e.g., ChatGPT reaches 94.6% accuracy on GSM8K), encouraging the community to further explore fine-grained feedback generation methods. (An illustrative sketch of the detect-and-rectify loop appears after this year's entries.)
@article{xue2023rcot, title = {Rcot: Detecting and rectifying factual inconsistency in reasoning by reversing chain-of-thought}, author = {Xue, Tianci and Wang, Ziqi and Wang, Zhenhailong and Han, Chi and Yu, Pengfei and Ji, Heng}, journal = {arXiv preprint arXiv:2305.11499}, year = {2023}, }
- [arXiv] Parameter-efficient tuning helps language model alignment. Tianci Xue, Ziqi Wang, and Heng Ji. arXiv preprint arXiv:2310.00819, 2023.
@article{xue2023parameter, title = {Parameter-efficient tuning helps language model alignment}, author = {Xue, Tianci and Wang, Ziqi and Ji, Heng}, journal = {arXiv preprint arXiv:2310.00819}, year = {2023}, }
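The RCoT abstract above describes a detect-and-rectify loop: reconstruct the problem from the generated solution, compare it with the original problem to surface factual inconsistencies, and feed the detected inconsistencies back as fine-grained feedback for revision. Below is a minimal sketch under stated assumptions; the prompt wording and the user-supplied call_llm client are hypothetical, not the paper's code.

```python
# Minimal sketch of an RCoT-style detect-and-rectify loop (illustrative only).
from typing import Callable

def rcot_style_revise(problem: str, solution: str,
                      call_llm: Callable[[str], str]) -> str:
    # 1) Reverse step: reconstruct the problem implied by the generated solution.
    reconstructed = call_llm(
        "Given only the following solution, write down the problem it solves:\n"
        + solution
    )
    # 2) Fine-grained comparison: surface overlooked, misread, or hallucinated conditions.
    feedback = call_llm(
        "Compare the two problems and list any conditions that were overlooked, "
        "misinterpreted, or hallucinated.\n"
        f"Original problem: {problem}\n"
        f"Reconstructed problem: {reconstructed}"
    )
    # 3) Rectify: revise the draft solution using the fine-grained feedback.
    return call_llm(
        f"Problem: {problem}\n"
        f"Draft solution: {solution}\n"
        f"Feedback on factual inconsistencies: {feedback}\n"
        "Revise the solution so it is fully consistent with the problem."
    )
```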