Adding trace analysis in preparation for page allocator microbenchmark.

This commit is contained in:
guangli-dai 2025-08-24 18:56:12 -07:00
parent 2ffb17fd3e
commit 301b69ea27
5 changed files with 472 additions and 0 deletions

1
.gitignore vendored
View file

@ -73,6 +73,7 @@ test/include/test/jemalloc_test_defs.h
/test/stress/[A-Za-z]*
!/test/stress/[A-Za-z]*.*
!/test/stress/pa/
/test/stress/*.[od]
/test/stress/*.out

View file

@ -566,6 +566,24 @@ $(objroot)test/stress/%$(EXE): $(objroot)test/stress/%.$(O) $(C_JET_OBJS) $(C_TE
@mkdir -p $(@D)
$(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)
# Link the standalone pa_data_preprocessor executable from its single object
# file. The link is driven by $(CXX) and only the $(O) prerequisites are passed.
$(objroot)test/stress/pa/pa_data_preprocessor$(EXE): $(objroot)test/stress/pa/pa_data_preprocessor.$(O)
@mkdir -p $(@D)
$(CXX) $(LDTARGET) $(filter %.$(O),$^) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)
# Compile C sources under test/stress/pa into objects with $(CC), adding the
# test include directories. When CC_MM is defined, a matching .d dependency
# file is generated next to the object.
$(objroot)test/stress/pa/%.$(O): $(srcroot)test/stress/pa/%.c
@mkdir -p $(@D)
$(CC) $(CFLAGS) -c $(CPPFLAGS) -I$(srcroot)test/include -I$(objroot)test/include $(CTARGET) $<
ifdef CC_MM
@$(CC) -MM $(CPPFLAGS) -I$(srcroot)test/include -I$(objroot)test/include -MT $@ -o $(@:%.$(O)=%.d) $<
endif
# Compile C++ sources under test/stress/pa into objects with $(CXX) and
# $(CXXFLAGS), adding the test include directories. When CC_MM is defined, a
# matching .d dependency file is generated next to the object.
$(objroot)test/stress/pa/%.$(O): $(srcroot)test/stress/pa/%.cpp
@mkdir -p $(@D)
$(CXX) $(CXXFLAGS) -c $(CPPFLAGS) -I$(srcroot)test/include -I$(objroot)test/include $(CTARGET) $<
ifdef CC_MM
@$(CXX) -MM $(CPPFLAGS) -I$(srcroot)test/include -I$(objroot)test/include -MT $@ -o $(@:%.$(O)=%.d) $<
endif
build_lib_shared: $(DSOS)
build_lib_static: $(STATIC_LIBS)
ifeq ($(enable_shared), 1)
@ -686,6 +704,7 @@ tests_unit: $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%$(EXE))
tests_integration: $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%$(EXE)) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%$(EXE))
tests_analyze: $(TESTS_ANALYZE:$(srcroot)%.c=$(objroot)%$(EXE))
tests_stress: $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%$(EXE)) $(TESTS_STRESS_CPP:$(srcroot)%.cpp=$(objroot)%$(EXE))
tests_pa: $(objroot)test/stress/pa/pa_data_preprocessor$(EXE)
tests: tests_unit tests_integration tests_analyze tests_stress
check_unit_dir:

23
test/stress/pa/.gitignore vendored Normal file
View file

@ -0,0 +1,23 @@
# Ignore executable files
pa_microbench
pa_data_preprocessor
# Ignore object files
*.o
*.d
# Ignore temporary and backup files
*~
*.tmp
*.bak
# Ignore compiled output files
*.out
# Keep source files and documentation
!*.c
!*.cpp
!*.h
!*.md
!*.sh
!Makefile*

6
test/stress/pa/data/.gitignore vendored Normal file
View file

@ -0,0 +1,6 @@
# Ignore data files
*.csv
# But keep example files
!example_*.csv
!*.md

View file

@ -0,0 +1,423 @@
#include <cassert>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
/*
* Page Allocator Data Preprocessor (C++ Version)
*
* This tool processes real allocation traces (collected via BPF)
* and converts them into a format suitable for the PA simulator.
*
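* Supported input formats (the first line of the input is treated as a
* header and skipped):
* HPA: shard_ind_int,addr_int,nsecs_int,probe,size_int
* SEC: process_id,thread_id,thread_name,nsecs_int,_c4,sec_ptr_int,sec_shard_ptr_int,edata_ptr_int,size_int,is_frequent_reuse_int
* (in the SEC format the `_c4` column holds the probe name, e.g.
* sec_alloc or sec_dalloc)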
*
* Output format (4 columns):
* shard_ind_int,operation_index,size_or_alloc_index,is_frequent
* where:
* - shard_ind_int: shard index as integer
* - operation_index: 0=alloc, 1=dalloc
* - size_or_alloc_index: for alloc operations show bytes,
* for dalloc operations show index of corresponding alloc
* - is_frequent: 1 if frequent reuse allocation, 0 otherwise
*/
/* Input trace layouts understood by the preprocessor. */
enum class TraceFormat {
    HPA, /* rows of shard_ind,addr,nsecs,probe,size */
    SEC  /* rows carrying sec/edata pointers plus a frequent-reuse flag */
};
/*
 * One parsed row of an input trace, normalized across the supported formats.
 *
 * Every scalar member has a default so that a freshly declared event never
 * holds indeterminate values, even if a parser bails out before filling it.
 */
struct TraceEvent {
    int shard_ind = 0;        /* shard (or mapped arena) index */
    uintptr_t addr = 0;       /* address used to pair an alloc with its dalloc */
    uint64_t nsecs = 0;       /* timestamp of the event */
    std::string probe;        /* probe / operation name as found in the trace */
    size_t size = 0;          /* size column of the row */
    bool is_frequent = false; /* frequent-reuse flag */
};
/*
 * What is remembered about one allocation.
 * Member order matters: records are brace-initialized positionally.
 */
struct AllocationRecord {
    /* Address the allocation is stored under. */
    uintptr_t addr;
    /* Size recorded for the allocation. */
    size_t size;
    /* Shard index recorded for the allocation. */
    int shard_ind;
    /* Index assigned to the allocation when it was recorded. */
    size_t alloc_index;
    /* Timestamp recorded for the allocation. */
    uint64_t nsecs;
};
class AllocationTracker {
private:
std::unordered_map<uintptr_t, AllocationRecord> records_;
public:
void
add_allocation(uintptr_t addr, size_t size, int shard_ind,
size_t alloc_index, uint64_t nsecs) {
records_[addr] = {addr, size, shard_ind, alloc_index, nsecs};
}
AllocationRecord *
find_allocation(uintptr_t addr) {
auto it = records_.find(addr);
return (it != records_.end()) ? &it->second : nullptr;
}
void
remove_allocation(uintptr_t addr) {
records_.erase(addr);
}
size_t
count() const {
return records_.size();
}
};
/*
 * Hands out dense arena indices (0, 1, 2, ...) to sec pointers in the order
 * in which they are first seen.
 */
class ArenaMapper {
public:
    ArenaMapper() : next_arena_index_(0) {}

    /*
     * Return the arena index for sec_ptr, assigning the next free index the
     * first time a pointer is seen.
     *
     * A null sec_ptr is not expected in a trace; it yields index 0 and is
     * not recorded in the map. Note that index 0 is also the index given to
     * the first non-null pointer.
     */
    int
    get_arena_index(uintptr_t sec_ptr) {
        if (sec_ptr == 0) {
            return 0;
        }
        /* emplace leaves an existing mapping untouched. */
        auto slot = sec_ptr_to_arena_.emplace(sec_ptr, next_arena_index_);
        const bool newly_seen = slot.second;
        if (newly_seen) {
            ++next_arena_index_;
        }
        return slot.first->second;
    }

    /* Number of distinct non-null sec pointers seen so far. */
    size_t
    arena_count() const {
        return sec_ptr_to_arena_.size();
    }

private:
    std::unordered_map<uintptr_t, int> sec_ptr_to_arena_;
    int next_arena_index_;
};
/* True when probe names an allocation event in either trace format. */
bool
is_alloc_operation(const std::string &probe) {
    if (probe == "hpa_alloc") {
        return true;
    }
    return probe == "sec_alloc";
}
/* True when probe names a deallocation event in either trace format. */
bool
is_dalloc_operation(const std::string &probe) {
    if (probe == "hpa_dalloc") {
        return true;
    }
    return probe == "sec_dalloc";
}
/*
 * Parse one HPA-format row: shard_ind_int,addr_int,nsecs_int,probe,size_int.
 *
 * Returns false on success and true on a parse error. A row is in error when
 * a column is missing, or when a numeric column cannot be converted (not a
 * number, or out of range); conversion failures are reported through the
 * return value instead of escaping as exceptions. On error, event may have
 * been partially filled and must not be used.
 */
bool
parse_hpa_line(const std::string &line, TraceEvent &event) {
    std::istringstream ss(line);
    std::string token;
    try {
        /* Parse shard_ind_int */
        if (!std::getline(ss, token, ',')) {
            return true;
        }
        event.shard_ind = std::stoi(token);
        /* Parse addr_int */
        if (!std::getline(ss, token, ',')) {
            return true;
        }
        event.addr = std::stoull(token);
        /* Parse nsecs_int */
        if (!std::getline(ss, token, ',')) {
            return true;
        }
        event.nsecs = std::stoull(token);
        /* Parse probe */
        if (!std::getline(ss, token, ',')) {
            return true;
        }
        event.probe = token;
        /* Parse size_int */
        if (!std::getline(ss, token, ',')) {
            return true;
        }
        event.size = std::stoull(token);
    } catch (const std::invalid_argument &) {
        /* A numeric column did not start with a number. */
        return true;
    } catch (const std::out_of_range &) {
        /* A numeric column does not fit the target type. */
        return true;
    }
    /* HPA format doesn't have is_frequent field, set default */
    event.is_frequent = true;
    return false;
}
/*
 * Parse one SEC-format row:
 *   process_id,thread_id,thread_name,nsecs_int,_c4,sec_ptr_int,
 *   sec_shard_ptr_int,edata_ptr_int,size_int,is_frequent_reuse_int
 *
 * The fifth column is stored as the probe name, sec_ptr_int is translated to
 * a shard index through arena_mapper, and edata_ptr_int is used as the
 * event address. For deallocation rows the two trailing columns may be
 * empty or absent, in which case size is 0 and is_frequent is false.
 *
 * Returns false on success and true on a parse error (missing required
 * column, or a numeric column that cannot be converted). On error, event may
 * have been partially filled and must not be used.
 */
bool
parse_sec_line(
    const std::string &line, TraceEvent &event, ArenaMapper &arena_mapper) {
    std::istringstream ss(line);
    std::string token;
    try {
        /* Skip process_id, thread_id and thread_name */
        for (int skipped = 0; skipped < 3; skipped++) {
            if (!std::getline(ss, token, ',')) {
                return true;
            }
        }
        /* Parse nsecs_int */
        if (!std::getline(ss, token, ',')) {
            return true;
        }
        event.nsecs = std::stoull(token);
        /* Parse operation */
        if (!std::getline(ss, token, ',')) {
            return true;
        }
        event.probe = token;
        /* Parse sec_ptr_int (used for arena mapping) */
        if (!std::getline(ss, token, ',')) {
            return true;
        }
        uintptr_t sec_ptr = 0;
        if (!token.empty()) {
            sec_ptr = std::stoull(token);
        }
        /* Map sec_ptr to arena index */
        event.shard_ind = arena_mapper.get_arena_index(sec_ptr);
        /* Skip sec_shard_ptr_int */
        if (!std::getline(ss, token, ',')) {
            return true;
        }
        /* Parse edata_ptr_int (used as the address) */
        if (!std::getline(ss, token, ',')) {
            return true;
        }
        event.addr = token.empty() ? 0 : std::stoull(token);

        const bool is_dalloc = is_dalloc_operation(event.probe);

        /*
         * Parse size_int. Once the stream has hit end-of-input, a failed
         * std::getline leaves its output string untouched, so the token
         * must be cleared first; otherwise the previous column would be
         * re-read as the size.
         */
        token.clear();
        if (!std::getline(ss, token, ',') && !is_dalloc) {
            /* SEC format may not always have size for dalloc */
            return true;
        }
        /* When no size given, this is a dalloc, size won't be used. */
        event.size = token.empty() ? 0 : std::stoull(token);

        /* Parse is_frequent_reuse_int (same stale-token precaution). */
        token.clear();
        if (!std::getline(ss, token, ',') && !is_dalloc) {
            return true;
        }
        /*
         * When no is_frequent_reuse_int given, this is a dalloc,
         * is_frequent won't be used.
         */
        event.is_frequent = !token.empty() && (std::stoi(token) != 0);
    } catch (const std::invalid_argument &) {
        /* A numeric column did not start with a number. */
        return true;
    } catch (const std::out_of_range &) {
        /* A numeric column does not fit the target type. */
        return true;
    }
    return false;
}
/* Emit the CSV header row naming the four output columns. */
void
write_output_header(std::ofstream &output) {
    static const char header_row[] =
        "shard_ind,operation,size_or_alloc_index,is_frequent\n";
    output << header_row;
}
/*
 * Emit one output row: shard_ind,operation,value,is_frequent, with the
 * boolean flag written as 1 or 0.
 */
void
write_output_event(std::ofstream &output, int shard_ind, int operation,
    size_t value, bool is_frequent) {
    const int frequent_flag = is_frequent ? 1 : 0;
    output << shard_ind << ',' << operation << ',' << value << ','
           << frequent_flag << '\n';
}
/*
 * Convert the trace in input_filename (parsed according to format) into the
 * simulator input written to output_filename.
 *
 * The first input line is treated as a header and skipped. Every allocation
 * row is written as operation 0 with its size and receives the next
 * sequential allocation index. Every deallocation row whose address matches
 * a tracked allocation is written as operation 1 with that allocation's
 * index; deallocations without a tracked allocation are counted as
 * unmatched and dropped. Rows that fail to parse are counted and reported
 * rather than silently discarded.
 *
 * Returns the number of matched alloc/dalloc pairs written. Returns 0 when
 * either file cannot be opened, the input is empty, or writing the output
 * fails (a message is printed to stderr in each case).
 */
size_t
process_trace_file(const std::string &input_filename,
    const std::string &output_filename, TraceFormat format) {
    std::ifstream input(input_filename);
    if (!input.is_open()) {
        std::cerr << "Failed to open input file: " << input_filename
                  << std::endl;
        return 0;
    }
    std::ofstream output(output_filename);
    if (!output.is_open()) {
        std::cerr << "Failed to open output file: " << output_filename
                  << std::endl;
        return 0;
    }

    AllocationTracker tracker;
    ArenaMapper arena_mapper; /* For SEC format arena mapping */
    std::string line;
    size_t line_count = 0;
    size_t output_count = 0;
    size_t alloc_sequence = 0; /* Sequential index for allocations */
    size_t unmatched_frees = 0;
    size_t skipped_lines = 0; /* Rows rejected by the parser */

    write_output_header(output);
    std::cout << "Reading from: " << input_filename << std::endl;

    /* Skip header line */
    if (!std::getline(input, line)) {
        std::cerr << "Error: Empty input file" << std::endl;
        return 0;
    }

    while (std::getline(input, line)) {
        line_count++;
        /* Skip empty lines */
        if (line.empty()) {
            continue;
        }

        /* Value-initialize so no field is ever read indeterminate. */
        TraceEvent event{};
        bool parse_error = false;
        if (format == TraceFormat::HPA) {
            parse_error = parse_hpa_line(line, event);
        } else if (format == TraceFormat::SEC) {
            parse_error = parse_sec_line(line, event, arena_mapper);
        }
        if (parse_error) {
            skipped_lines++;
            continue;
        }

        if (is_alloc_operation(event.probe)) {
            /* This is an allocation */
            write_output_event(output, event.shard_ind, 0, event.size,
                event.is_frequent);
            /* Track this allocation with the current sequence number */
            tracker.add_allocation(event.addr, event.size,
                event.shard_ind, alloc_sequence, event.nsecs);
            alloc_sequence++;
        } else if (is_dalloc_operation(event.probe)) {
            /* Ignore dalloc without a corresponding alloc. */
            AllocationRecord *record = tracker.find_allocation(
                event.addr);
            if (record) {
                /*
                 * Deallocation should happen after allocation. This
                 * is only checked in builds where assert is active.
                 */
                assert(event.nsecs >= record->nsecs);
                write_output_event(output, event.shard_ind, 1,
                    record->alloc_index, event.is_frequent);
                tracker.remove_allocation(event.addr);
                output_count++; /* Count this deallocation */
            } else {
                unmatched_frees++;
            }
        } else {
            std::cerr << "Unknown operation: " << event.probe
                      << std::endl;
        }
    }

    /* Surface write errors (e.g. disk full) instead of reporting success. */
    output.flush();
    if (!output) {
        std::cerr << "Failed to write output file: " << output_filename
                  << std::endl;
        return 0;
    }

    std::cout << "Processed " << line_count << " lines" << std::endl;
    std::cout << "Skipped malformed lines: " << skipped_lines << std::endl;
    std::cout << "Unmatched frees: " << unmatched_frees << std::endl;
    std::cout << "Extracted " << output_count << " alloc/dalloc pairs"
              << std::endl;
    std::cout << "Results written to: " << output_filename << std::endl;
    return output_count;
}
/*
 * Translate the command-line format name into a TraceFormat.
 * Accepts exactly "hpa" or "sec"; anything else throws
 * std::invalid_argument with a message naming the rejected value.
 */
TraceFormat
parse_format(const std::string &format_str) {
    if (format_str == "hpa") {
        return TraceFormat::HPA;
    }
    if (format_str == "sec") {
        return TraceFormat::SEC;
    }
    const std::string complaint =
        "Unknown format: " + format_str + ". Use 'hpa' or 'sec'";
    throw std::invalid_argument(complaint);
}
int
main(int argc, char *argv[]) {
if (argc < 4 || argc > 5) {
std::cerr << "Usage: " << argv[0]
<< " <format> <input_csv_file> <output_file>"
<< std::endl;
std::cerr << std::endl;
std::cerr << "Arguments:" << std::endl;
std::cerr << " format - Input format: 'hpa' or 'sec'"
<< std::endl;
std::cerr
<< " hpa: shard_ind_int,addr_int,nsecs_int,probe,size_int"
<< std::endl;
std::cerr
<< " sec: process_id,thread_id,thread_name,nsecs_int,_c4,sec_ptr_int,sec_shard_ptr_int,edata_ptr_int,size_int,is_frequent_reuse_int"
<< std::endl;
std::cerr << " input_csv_file - Input CSV trace file"
<< std::endl;
std::cerr
<< " output_file - Output file for simulator with format:"
<< std::endl;
std::cerr
<< " shard_ind,operation,size_or_alloc_index,is_frequent"
<< std::endl;
std::cerr << std::endl;
std::cerr << "Output format:" << std::endl;
std::cerr << " - operation: 0=alloc, 1=dalloc" << std::endl;
std::cerr
<< " - size_or_alloc_index: bytes for alloc, alloc index for dalloc"
<< std::endl;
return 1;
}
try {
TraceFormat format = parse_format(argv[1]);
std::string input_file = argv[2];
std::string output_file = argv[3];
size_t events_generated = process_trace_file(
input_file, output_file, format);
if (events_generated == 0) {
std::cerr
<< "No events generated. Check input file format and filtering criteria."
<< std::endl;
return 1;
}
return 0;
} catch (const std::exception &e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}
}