在计算机编程中,字符串操作是一种使用频率极高的操作。因此,如何在C++中实现高效的字符串操作是每个程序员必须掌握的技能之一。本文将从多个方面详细阐述如何用C++实现高效的字符串操作。
一、字符串的概念及常见操作
字符串是指多个字符组成的序列,在C++中常用char数组或std::string来表示字符串。常见的字符串操作包括:
1、字符串拼接。将两个或多个字符串连接起来,可以使用+运算符或std::string的append方法。
std::string str1 = "Hello"; std::string str2 = "World"; // 使用+运算符 std::string str3 = str1 + str2; // 使用append方法 str1.append(str2);
2、字符串查找。在一个字符串中查找是否包含另一个字符串,可以使用std::string的find方法。
std::string str = "Hello, world!"; if (str.find("world") != std::string::npos) { std::cout << "Found" << std::endl; } else { std::cout << "Not found" << std::endl; }
3、字符串分割。将一个字符串按照某个分隔符分割成多个子字符串,可以使用std::string的substr和find方法。
std::string str = "Hello,world,!"; std::vector<std::string> substrs; std::string::size_type pos1 = 0, pos2 = 0; while ((pos2 = str.find(",", pos1)) != std::string::npos) { substrs.push_back(str.substr(pos1, pos2 - pos1)); pos1 = pos2 + 1; } substrs.push_back(str.substr(pos1));
二、字符串操作的效率问题
对于常规的字符串操作,直接使用std::string的方法或者使用char数组实现都可以满足要求。但对于需要高效处理字符串的场景,需要考虑字符串操作的效率。
1、字符串拼接。直接使用+运算符或std::string的append方法会导致频繁的动态内存分配和释放,影响性能。可以使用std::stringstream或者char数组来优化。例如使用std::stringstream:
std::string str1 = "Hello"; std::string str2 = "World"; std::stringstream ss; ss << str1 << str2; std::string str3 = ss.str();
使用char数组:
std::string str1 = "Hello"; std::string str2 = "World"; char buf[100]; std::snprintf(buf, sizeof(buf), "%s%s", str1.c_str(), str2.c_str()); std::string str3 = buf;
2、字符串查找。std::string的find方法在查找过程中会使用暴力匹配算法,时间复杂度为O(n*m),n和m分别为两个字符串的长度。可以使用KMP算法或Boyer-Moore算法来优化。例如使用KMP算法:
std::string str = "Hello, world!"; std::string pattern = "world"; std::vector<int> next(pattern.size(), -1); for (int i = 1, j = -1; i < pattern.size(); ++i) { while (j >= 0 && pattern[j + 1] != pattern[i]) { j = next[j]; } if (pattern[j + 1] == pattern[i]) { ++j; } next[i] = j; } for (int i = 0, j = -1; i < str.size(); ++i) { while (j >= 0 && pattern[j + 1] != str[i]) { j = next[j]; } if (pattern[j + 1] == str[i]) { ++j; } if (j == pattern.size() - 1) { std::cout << "Found at " << i - j << std::endl; j = next[j]; } }
3、字符串分割。使用std::string的substr方法和find方法会导致频繁的字符串拷贝和动态内存分配和释放,影响性能。可以使用标准库的std::regex来实现正则表达式匹配,或者手写字符串分割算法。例如使用手写分割算法:
std::string str = "Hello,world,!"; std::vector<std::string> substrs; std::string::size_type pos1 = 0, pos2 = 0; while ((pos2 = str.find(",", pos1)) != std::string::npos) { substrs.push_back(str.substr(pos1, pos2 - pos1)); pos1 = pos2 + 1; } substrs.push_back(str.substr(pos1));
三、常用字符串库的性能比较
不同的字符串库在实现上有所不同,因此在性能上也有所差异。我们在一台64位Ubuntu系统上,使用g++编译器比较了STL库、Boost库和folly库的字符串拼接、查找和分割性能。代码如下:
#include <iostream> #include <chrono> #include <vector> #include <string> #include <sstream> #include <boost/algorithm/string.hpp> #include <boost/lexical_cast.hpp> #include <folly/String.h> int main() { // 字符串拼接 std::string str1 = "Hello"; std::string str2 = "World"; { std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); for (int i = 0; i < 1000000; ++i) { std::stringstream ss; ss << str1 << str2; std::string result = ss.str(); } std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::cout << "std::stringstream: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us" << std::endl; } { std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); for (int i = 0; i < 1000000; ++i) { char buf[100]; std::snprintf(buf, sizeof(buf), "%s%s", str1.c_str(), str2.c_str()); std::string result = buf; } std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::cout << "std::snprintf: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us" << std::endl; } { std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); for (int i = 0; i < 1000000; ++i) { std::string result = str1 + str2; } std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::cout << "std::string: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us" << std::endl; } { std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); for (int i = 0; i < 1000000; ++i) { std::string result = str1.append(str2); } std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::cout << "std::string::append: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us" << std::endl; } { std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); for (int i = 0; i < 1000000; ++i) { std::string result = folly::to<std::string>(str1, str2); } std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::cout << "folly::to: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us" << std::endl; } // 字符串查找 std::string str = "Hello, world!"; std::string pattern = "world"; { std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); for (int i = 0; i < 1000000; ++i) { std::size_t pos = str.find(pattern); if (pos != std::string::npos) { std::string result = str.substr(pos); } } std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::cout << "std::string::find and substr: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us" << std::endl; } { std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); for (int i = 0; i < 1000000; ++i) { std::vector<std::string> substrs; boost::split(substrs, str, boost::is_any_of(",")); for (const auto& substr : substrs) { if (substr == pattern) { std::string result = substr; } } } std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::cout << "boost::split and for loop: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us" << std::endl; } { std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); for (int i = 0; i < 1000000; ++i) { std::vector<std::string> substrs; folly::split(",", str, substrs); for (const auto& substr : substrs) { if (substr == pattern) { std::string result = substr; } } } std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::cout << "folly::split and for loop: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us" << std::endl; } // 字符串转换 { std::string str1 = "12345"; std::string str2 = "67890"; { std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); for (int i = 0; i < 1000000; ++i) { std::string result = str1 + str2; int value = std::stoi(result); } std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::cout << "std::stoi: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us" << std::endl; } { std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); for (int i = 0; i < 1000000; ++i) { std::string result = str1 + str2; int value = boost::lexical_cast<int>(result); } std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::cout << "boost::lexical_cast: