From: Nai y. z. <zha...@gm...> - 2012-02-10 08:01:29
|
Greetings, Could anybody help me a little out of my difficulty? I have a SSD and I am trying to use it to simulate my program I/O performance, however, IOPS calculated from my program is much much faster than IOMeter. My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B random read IOPS is around 94k (queue depth is 32). However my program (32 windows threads) can reach around 500k 512B IOPS, around 5 times of IOMeter!!! I did data validation but didn't find any error in data fetching. Is it because my data fetching is in order? I paste my code below (it mainly fetch 512B from file and release it; I did use 4bytes (an int) to validate program logic and didn't find problem), can anybody help me figure out where I am wrong? Thanks so much in advance!! Nai Yan. #include <stdio.h> #include <Windows.h> /* ** Purpose: Verify file random read IOPS in comparison with IOMeter ** Author: Nai Yan ** Date: Feb. 9th, 2012 **/ //Global variables long completeIOs = 0; long completeBytes = 0; int threadCount = 32; unsigned long long length = 1073741824; //4G test file int interval = 1024; int resultArrayLen = 320000; int *result = new int[resultArrayLen]; //Method declarison double GetSecs(void); //Calculate out duration int InitPool(long long,char*,int); //Initialize test data for testing, if successful, return 1; otherwise, return a non 1 value. int * FileRead(char * path); unsigned int DataVerification(int*, int sampleItem); //Verify data fetched from pool int main() { int sampleItem = 0x1; char * fPath = "G:\\workspace\\4G.bin"; unsigned int invalidIO = 0; if (InitPool(length,fPath,sampleItem)!= 1) printf("File write err... \n"); //start do random I/Os from initialized file double start = GetSecs(); int * fetchResult = FileRead(fPath); double end = GetSecs(); printf("File read IOPS is %.4f per second.. 
\n",completeIOs/(end - start)); //start data validation, for 4 bytes fetch only // invalidIO = DataVerification(fetchResult,sampleItem); // if (invalidIO !=0) // { // printf("Total invalid data fetch IOs are %d", invalidIO); // } return 0; } int InitPool(long long length, char* path, int sample) { printf("Start initializing test data ... \n"); FILE * fp = fopen(path,"wb"); if (fp == NULL) { printf("file open err... \n"); exit (-1); } else //initialize file for testing { fseek(fp,0L,SEEK_SET); for (int i=0; i<length; i++) { fwrite(&sample,sizeof(int),1,fp); } fclose(fp); fp = NULL; printf("Data initialization is complete...\n"); return 1; } } double GetSecs(void) { LARGE_INTEGER frequency; LARGE_INTEGER start; if(! QueryPerformanceFrequency(&frequency)) printf("QueryPerformanceFrequency Failed\n"); if(! QueryPerformanceCounter(&start)) printf("QueryPerformanceCounter Failed\n"); return ((double)start.QuadPart/(double)frequency.QuadPart); } class input { public: char *path; int starting; input (int st, char * filePath):starting(st),path(filePath){} }; //Workers DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) { input * in = (input*) lpThreadParameter; char* path = in->path; FILE * fp = fopen(path,"rb"); int sPos = in->starting; // int * result = in->r; if(fp != NULL) { fpos_t pos; for (int i=0; i<resultArrayLen/threadCount;i++) { pos = i * interval; fsetpos(fp,&pos); //For 512 bytes fetch each time unsigned char *c =new unsigned char [512]; if (fread(c,512,1,fp) ==1) { InterlockedIncrement(&completeIOs); delete c; } //For 4 bytes fetch each time /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) { InterlockedIncrement(&completeIOs); }*/ else { printf("file read err...\n"); exit(-1); } } fclose(fp); fp = NULL; } else { printf("File open err... \n"); exit(-1); } } int * FileRead(char * p) { printf("Starting reading file ... 
\n"); HANDLE mWorkThread[256]; //max 256 threads completeIOs = 0; int slice = int (resultArrayLen/threadCount); for(int i = 0; i < threadCount; i++) { mWorkThread[i] = CreateThread( NULL, 0, FileReadThreadEntry, (LPVOID)(new input(i*slice,p)), 0, NULL); } WaitForMultipleObjects(threadCount, mWorkThread, TRUE, INFINITE); printf("File read complete... \n"); return result; } unsigned int DataVerification(int* result, int sampleItem) { unsigned int invalid = 0; for (int i=0; i< resultArrayLen/interval;i++) { if (result[i]!=sampleItem) { invalid ++; continue; } } return invalid; } |
From: <jo...@ei...> - 2012-02-10 14:34:50
|
Forgive me if I missed it, but I don't see any randomization in your file reads. It looks like you just skip ahead so thread 0 reads the first 512bytes, thread 1 the next 512b. So any storage will be prefetching very effectively. Tell Iometer to do sequential instead of random and see how much closer the numbers are. Or better yet, make your program randomize its reads over the entire disk. Joe Quoting Nai yan zhao <zha...@gm...>: > Greetings, > Could anybody help me a little out of my difficulty? > > I have a SSD and I am trying to use it to simulate my program I/O > performance, however, IOPS calculated from my program is much much faster > than IOMeter. > > My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B random read > IOPS is around 94k (queue depth is 32). > However my program (32 windows threads) can reach around 500k 512B > IOPS, around 5 times of IOMeter!!! I did data validation but didn't find > any error in data fetching. It's because my data fetching in order? > > I paste my code belwo (it mainly fetch 512B from file and release it; > I did use 4bytes (an int) to validate program logic and didn't find > problem), can anybody help me figure out where I am wrong? > > Thanks so much in advance!! > > Nai Yan. > > #include <stdio.h> > #include <Windows.h> > /* > ** Purpose: Verify file random read IOPS in comparison with IOMeter > ** Author: Nai Yan > ** Date: Feb. 9th, 2012 > **/ > //Global variables > long completeIOs = 0; > long completeBytes = 0; > int threadCount = 32; > unsigned long long length = 1073741824; //4G test file > int interval = 1024; > int resultArrayLen = 320000; > int *result = new int[resultArrayLen]; > //Method declarison > double GetSecs(void); //Calculate out duration > int InitPool(long long,char*,int); //Initialize test data for > testing, if successful, return 1; otherwise, return a non 1 value. 
> int * FileRead(char * path); > unsigned int DataVerification(int*, int sampleItem); > //Verify data fetched from pool > int main() > { > int sampleItem = 0x1; > char * fPath = "G:\\workspace\\4G.bin"; > unsigned int invalidIO = 0; > if (InitPool(length,fPath,sampleItem)!= 1) > printf("File write err... \n"); > //start do random I/Os from initialized file > double start = GetSecs(); > int * fetchResult = FileRead(fPath); > double end = GetSecs(); > printf("File read IOPS is %.4f per second.. \n",completeIOs/(end - start)); > //start data validation, for 4 bytes fetch only > // invalidIO = DataVerification(fetchResult,sampleItem); > // if (invalidIO !=0) > // { > // printf("Total invalid data fetch IOs are %d", invalidIO); > // } > return 0; > } > > > int InitPool(long long length, char* path, int sample) > { > printf("Start initializing test data ... \n"); > FILE * fp = fopen(path,"wb"); > if (fp == NULL) > { > printf("file open err... \n"); > exit (-1); > } > else //initialize file for testing > { > fseek(fp,0L,SEEK_SET); > for (int i=0; i<length; i++) > { > fwrite(&sample,sizeof(int),1,fp); > } > fclose(fp); > fp = NULL; > printf("Data initialization is complete...\n"); > return 1; > } > } > double GetSecs(void) > { > LARGE_INTEGER frequency; > LARGE_INTEGER start; > if(! QueryPerformanceFrequency(&frequency)) > printf("QueryPerformanceFrequency Failed\n"); > if(! 
QueryPerformanceCounter(&start)) > printf("QueryPerformanceCounter Failed\n"); > return ((double)start.QuadPart/(double)frequency.QuadPart); > } > class input > { > public: > char *path; > int starting; > input (int st, char * filePath):starting(st),path(filePath){} > }; > //Workers > DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) > { > input * in = (input*) lpThreadParameter; > char* path = in->path; > FILE * fp = fopen(path,"rb"); > int sPos = in->starting; > // int * result = in->r; > if(fp != NULL) > { > fpos_t pos; > for (int i=0; i<resultArrayLen/threadCount;i++) > { > pos = i * interval; > fsetpos(fp,&pos); > //For 512 bytes fetch each time > unsigned char *c =new unsigned char [512]; > if (fread(c,512,1,fp) ==1) > { > InterlockedIncrement(&completeIOs); > delete c; > } > //For 4 bytes fetch each time > /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) > { > InterlockedIncrement(&completeIOs); > }*/ > else > { > printf("file read err...\n"); > exit(-1); > } > } > fclose(fp); > fp = NULL; > } > else > { > printf("File open err... \n"); > exit(-1); > } > } > int * FileRead(char * p) > { > printf("Starting reading file ... \n"); > HANDLE mWorkThread[256]; //max 256 threads > completeIOs = 0; > int slice = int (resultArrayLen/threadCount); > for(int i = 0; i < threadCount; i++) > { > mWorkThread[i] = CreateThread( > NULL, > 0, > FileReadThreadEntry, > (LPVOID)(new input(i*slice,p)), > 0, > NULL); > } > WaitForMultipleObjects(threadCount, mWorkThread, TRUE, INFINITE); > printf("File read complete... \n"); > return result; > } > unsigned int DataVerification(int* result, int sampleItem) > { > unsigned int invalid = 0; > for (int i=0; i< resultArrayLen/interval;i++) > { > if (result[i]!=sampleItem) > { > invalid ++; > continue; > } > } > return invalid; > } > |
From: Fabian T. <fa...@ti...> - 2012-02-10 16:30:42
|
If I read the test correctly, all threads start at offset 0, and then perform 512b reads with a 1024b stride between reads. As Joe said, this is pretty much sequential reading, and all threads are reading the same data, so most are likely to be satisfied from cache, either in the OS or on the SSD itself. They'll do 320000/16=20000 IO operations total each, so end up reading 20MB of the file. It's quite likely that the whole 20MB that you are reading will sit happily in the file cache. Create an access pattern that mimics your app (512b sequential with 1024b stride), create 32 workers, and see if the results are similar. Best would be if you created a test file of 20MB, too. You can then see how things compare if you go with async I/O and a single thread. Cheers, -Fab On Fri, Feb 10, 2012 at 5:40 AM, <jo...@ei...> wrote: > Forgive me if I missed it, but I don't see any randomization in your > file reads. > > It looks like you just skip ahead so thread 0 reads the first > 512bytes, thread 1 the next 512b. So any storage will be prefetching > very effectively. > > Tell Iometer to do sequential instead of random and see how much > closer the numbers are. Or better yet, make your program randomize > its reads over the entire disk. > > Joe > > > Quoting Nai yan zhao <zha...@gm...>: > >> Greetings, >> Could anybody help me a little out of my difficulty? >> >> I have a SSD and I am trying to use it to simulate my program I/O >> performance, however, IOPS calculated from my program is much much faster >> than IOMeter. >> >> My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B random read >> IOPS is around 94k (queue depth is 32). >> However my program (32 windows threads) can reach around 500k 512B >> IOPS, around 5 times of IOMeter!!! I did data validation but didn't find >> any error in data fetching. It's because my data fetching in order? 
>> >> I paste my code belwo (it mainly fetch 512B from file and release it; >> I did use 4bytes (an int) to validate program logic and didn't find >> problem), can anybody help me figure out where I am wrong? >> >> Thanks so much in advance!! >> >> Nai Yan. >> >> #include <stdio.h> >> #include <Windows.h> >> /* >> ** Purpose: Verify file random read IOPS in comparison with IOMeter >> ** Author: Nai Yan >> ** Date: Feb. 9th, 2012 >> **/ >> //Global variables >> long completeIOs = 0; >> long completeBytes = 0; >> int threadCount = 32; >> unsigned long long length = 1073741824; //4G test file >> int interval = 1024; >> int resultArrayLen = 320000; >> int *result = new int[resultArrayLen]; >> //Method declarison >> double GetSecs(void); //Calculate out duration >> int InitPool(long long,char*,int); //Initialize test data for >> testing, if successful, return 1; otherwise, return a non 1 value. >> int * FileRead(char * path); >> unsigned int DataVerification(int*, int sampleItem); >> //Verify data fetched from pool >> int main() >> { >> int sampleItem = 0x1; >> char * fPath = "G:\\workspace\\4G.bin"; >> unsigned int invalidIO = 0; >> if (InitPool(length,fPath,sampleItem)!= 1) >> printf("File write err... \n"); >> //start do random I/Os from initialized file >> double start = GetSecs(); >> int * fetchResult = FileRead(fPath); >> double end = GetSecs(); >> printf("File read IOPS is %.4f per second.. \n",completeIOs/(end - start)); >> //start data validation, for 4 bytes fetch only >> // invalidIO = DataVerification(fetchResult,sampleItem); >> // if (invalidIO !=0) >> // { >> // printf("Total invalid data fetch IOs are %d", invalidIO); >> // } >> return 0; >> } >> >> >> int InitPool(long long length, char* path, int sample) >> { >> printf("Start initializing test data ... \n"); >> FILE * fp = fopen(path,"wb"); >> if (fp == NULL) >> { >> printf("file open err... 
\n"); >> exit (-1); >> } >> else //initialize file for testing >> { >> fseek(fp,0L,SEEK_SET); >> for (int i=0; i<length; i++) >> { >> fwrite(&sample,sizeof(int),1,fp); >> } >> fclose(fp); >> fp = NULL; >> printf("Data initialization is complete...\n"); >> return 1; >> } >> } >> double GetSecs(void) >> { >> LARGE_INTEGER frequency; >> LARGE_INTEGER start; >> if(! QueryPerformanceFrequency(&frequency)) >> printf("QueryPerformanceFrequency Failed\n"); >> if(! QueryPerformanceCounter(&start)) >> printf("QueryPerformanceCounter Failed\n"); >> return ((double)start.QuadPart/(double)frequency.QuadPart); >> } >> class input >> { >> public: >> char *path; >> int starting; >> input (int st, char * filePath):starting(st),path(filePath){} >> }; >> //Workers >> DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) >> { >> input * in = (input*) lpThreadParameter; >> char* path = in->path; >> FILE * fp = fopen(path,"rb"); >> int sPos = in->starting; >> // int * result = in->r; >> if(fp != NULL) >> { >> fpos_t pos; >> for (int i=0; i<resultArrayLen/threadCount;i++) >> { >> pos = i * interval; >> fsetpos(fp,&pos); >> //For 512 bytes fetch each time >> unsigned char *c =new unsigned char [512]; >> if (fread(c,512,1,fp) ==1) >> { >> InterlockedIncrement(&completeIOs); >> delete c; >> } >> //For 4 bytes fetch each time >> /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) >> { >> InterlockedIncrement(&completeIOs); >> }*/ >> else >> { >> printf("file read err...\n"); >> exit(-1); >> } >> } >> fclose(fp); >> fp = NULL; >> } >> else >> { >> printf("File open err... \n"); >> exit(-1); >> } >> } >> int * FileRead(char * p) >> { >> printf("Starting reading file ... 
\n"); >> HANDLE mWorkThread[256]; //max 256 threads >> completeIOs = 0; >> int slice = int (resultArrayLen/threadCount); >> for(int i = 0; i < threadCount; i++) >> { >> mWorkThread[i] = CreateThread( >> NULL, >> 0, >> FileReadThreadEntry, >> (LPVOID)(new input(i*slice,p)), >> 0, >> NULL); >> } >> WaitForMultipleObjects(threadCount, mWorkThread, TRUE, INFINITE); >> printf("File read complete... \n"); >> return result; >> } >> unsigned int DataVerification(int* result, int sampleItem) >> { >> unsigned int invalid = 0; >> for (int i=0; i< resultArrayLen/interval;i++) >> { >> if (result[i]!=sampleItem) >> { >> invalid ++; >> continue; >> } >> } >> return invalid; >> } >> > > > > > ------------------------------------------------------------------------------ > Virtualization & Cloud Management Using Capacity Planning > Cloud computing makes use of virtualization - but cloud computing > also focuses on allowing computing to be delivered as a service. > http://www.accelacomm.com/jaw/sfnl/114/51521223/ > _______________________________________________ > Iometer-devel mailing list > Iom...@li... > https://lists.sourceforge.net/lists/listinfo/iometer-devel |
From: Nai y. z. <zha...@gm...> - 2012-02-12 14:17:42
|
Hello Fabian and Joe, Thank you so much for your reply. Actually, what I am trying to do, is to split a file into 32 parts, and each part will be assigned to a thread to read. Each thread each time to open file, read 512B, and close file. I was trying to avoid 2 read I/Os hit 1 block(512B) - i.e. to avoid cache in SSD (it's 128MB), although most read I/Os are ordered but not contiguous<http://en.wikipedia.org/wiki/Contiguity#Computer_science> . By your suggestion, I tried 512B sequential I/O with settings below, Max disk size - 8388608 # of Outstanding I/O - 32 (for 64, it's also around 82K) Transfer request size - 512B, 100% sequential Reply size - no reply Align I/Os on - Sector boundaries The result is around 82K, still much slower than my program. If my program has any defect in calculating IOPS? Or if I have any misunderstanding of caching of SSD or file system, which causes my program fetches data most from RAM of SSD? Or what parameters I should set in I/O meter to simulate my program I/O? Thank you again in advance for your time to help investigate it!! Nai Yan. 2012/2/11 Fabian Tillier <fa...@ti...> > If I read the test correctly, all threads start at offset 0, and then > perform 512b reads with a 1024b stride between reads. As Joe said, > this is pretty much sequential reading, and all threads are reading > the same data, so most are likely to be satisifed from cache, either > in the OS or on the SSD itself. They'll do 320000/16=20000 IO > operations total each, so end up reading 20MB of the file. It's quite > likely that the whole 20MB that you are reading will sit happilly in > the file cache. > > Create an access pattern that mimics your app (512b sequential with > 1024b stride), create 32 workers, and see if the results are similar. > Best would be if you created a test file of 20MB, too. You can then > see how things compare if you go with async I/O and a single thread. 
> > Cheers, > -Fab > > On Fri, Feb 10, 2012 at 5:40 AM, <jo...@ei...> wrote: > > Forgive me if I missed it, but I don't see any randomization in your > > file reads. > > > > It looks like you just skip ahead so thread 0 reads the first > > 512bytes, thread 1 the next 512b. So any storage will be prefetching > > very effectively. > > > > Tell Iometer to do sequential instead of random and see how much > > closer the numbers are. Or better yet, make your program randomize > > its reads over the entire disk. > > > > Joe > > > > > > Quoting Nai yan zhao <zha...@gm...>: > > > >> Greetings, > >> Could anybody help me a little out of my difficulty? > >> > >> I have a SSD and I am trying to use it to simulate my program I/O > >> performance, however, IOPS calculated from my program is much much > faster > >> than IOMeter. > >> > >> My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B random read > >> IOPS is around 94k (queue depth is 32). > >> However my program (32 windows threads) can reach around 500k 512B > >> IOPS, around 5 times of IOMeter!!! I did data validation but didn't find > >> any error in data fetching. It's because my data fetching in order? > >> > >> I paste my code belwo (it mainly fetch 512B from file and release > it; > >> I did use 4bytes (an int) to validate program logic and didn't find > >> problem), can anybody help me figure out where I am wrong? > >> > >> Thanks so much in advance!! > >> > >> Nai Yan. > >> > >> #include <stdio.h> > >> #include <Windows.h> > >> /* > >> ** Purpose: Verify file random read IOPS in comparison with IOMeter > >> ** Author: Nai Yan > >> ** Date: Feb. 
9th, 2012 > >> **/ > >> //Global variables > >> long completeIOs = 0; > >> long completeBytes = 0; > >> int threadCount = 32; > >> unsigned long long length = 1073741824; //4G test file > >> int interval = 1024; > >> int resultArrayLen = 320000; > >> int *result = new int[resultArrayLen]; > >> //Method declarison > >> double GetSecs(void); //Calculate out duration > >> int InitPool(long long,char*,int); //Initialize test data for > >> testing, if successful, return 1; otherwise, return a non 1 value. > >> int * FileRead(char * path); > >> unsigned int DataVerification(int*, int sampleItem); > >> //Verify data fetched from pool > >> int main() > >> { > >> int sampleItem = 0x1; > >> char * fPath = "G:\\workspace\\4G.bin"; > >> unsigned int invalidIO = 0; > >> if (InitPool(length,fPath,sampleItem)!= 1) > >> printf("File write err... \n"); > >> //start do random I/Os from initialized file > >> double start = GetSecs(); > >> int * fetchResult = FileRead(fPath); > >> double end = GetSecs(); > >> printf("File read IOPS is %.4f per second.. \n",completeIOs/(end - > start)); > >> //start data validation, for 4 bytes fetch only > >> // invalidIO = DataVerification(fetchResult,sampleItem); > >> // if (invalidIO !=0) > >> // { > >> // printf("Total invalid data fetch IOs are %d", invalidIO); > >> // } > >> return 0; > >> } > >> > >> > >> int InitPool(long long length, char* path, int sample) > >> { > >> printf("Start initializing test data ... \n"); > >> FILE * fp = fopen(path,"wb"); > >> if (fp == NULL) > >> { > >> printf("file open err... \n"); > >> exit (-1); > >> } > >> else //initialize file for testing > >> { > >> fseek(fp,0L,SEEK_SET); > >> for (int i=0; i<length; i++) > >> { > >> fwrite(&sample,sizeof(int),1,fp); > >> } > >> fclose(fp); > >> fp = NULL; > >> printf("Data initialization is complete...\n"); > >> return 1; > >> } > >> } > >> double GetSecs(void) > >> { > >> LARGE_INTEGER frequency; > >> LARGE_INTEGER start; > >> if(! 
QueryPerformanceFrequency(&frequency)) > >> printf("QueryPerformanceFrequency Failed\n"); > >> if(! QueryPerformanceCounter(&start)) > >> printf("QueryPerformanceCounter Failed\n"); > >> return ((double)start.QuadPart/(double)frequency.QuadPart); > >> } > >> class input > >> { > >> public: > >> char *path; > >> int starting; > >> input (int st, char * filePath):starting(st),path(filePath){} > >> }; > >> //Workers > >> DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) > >> { > >> input * in = (input*) lpThreadParameter; > >> char* path = in->path; > >> FILE * fp = fopen(path,"rb"); > >> int sPos = in->starting; > >> // int * result = in->r; > >> if(fp != NULL) > >> { > >> fpos_t pos; > >> for (int i=0; i<resultArrayLen/threadCount;i++) > >> { > >> pos = i * interval; > >> fsetpos(fp,&pos); > >> //For 512 bytes fetch each time > >> unsigned char *c =new unsigned char [512]; > >> if (fread(c,512,1,fp) ==1) > >> { > >> InterlockedIncrement(&completeIOs); > >> delete c; > >> } > >> //For 4 bytes fetch each time > >> /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) > >> { > >> InterlockedIncrement(&completeIOs); > >> }*/ > >> else > >> { > >> printf("file read err...\n"); > >> exit(-1); > >> } > >> } > >> fclose(fp); > >> fp = NULL; > >> } > >> else > >> { > >> printf("File open err... \n"); > >> exit(-1); > >> } > >> } > >> int * FileRead(char * p) > >> { > >> printf("Starting reading file ... \n"); > >> HANDLE mWorkThread[256]; //max 256 threads > >> completeIOs = 0; > >> int slice = int (resultArrayLen/threadCount); > >> for(int i = 0; i < threadCount; i++) > >> { > >> mWorkThread[i] = CreateThread( > >> NULL, > >> 0, > >> FileReadThreadEntry, > >> (LPVOID)(new input(i*slice,p)), > >> 0, > >> NULL); > >> } > >> WaitForMultipleObjects(threadCount, mWorkThread, TRUE, INFINITE); > >> printf("File read complete... 
\n"); > >> return result; > >> } > >> unsigned int DataVerification(int* result, int sampleItem) > >> { > >> unsigned int invalid = 0; > >> for (int i=0; i< resultArrayLen/interval;i++) > >> { > >> if (result[i]!=sampleItem) > >> { > >> invalid ++; > >> continue; > >> } > >> } > >> return invalid; > >> } > >> > > > > > > > > > > > ------------------------------------------------------------------------------ > > Virtualization & Cloud Management Using Capacity Planning > > Cloud computing makes use of virtualization - but cloud computing > > also focuses on allowing computing to be delivered as a service. > > http://www.accelacomm.com/jaw/sfnl/114/51521223/ > > _______________________________________________ > > Iometer-devel mailing list > > Iom...@li... > > https://lists.sourceforge.net/lists/listinfo/iometer-devel > |
From: <jo...@ei...> - 2012-02-12 16:34:46
|
82K sounds reasonable for iops on an SSD. You should check the specs of your drive to see what you should expect. You need to remember that you are doing file i/o so you have several layers of cache involved. think of it as file cache -> block cache -> controller cache -> drive cache (you aren't testing a HW RAID, so you probably don't have cache in your controller) My personal rule of thumb for random I/O is to have my file size be about 3x my combined cache size. For example, 4G ram in system, 512MB RAID cache, (8 drives*32MB) = 4.75GB I'd do a 16GB file. If in iometer you are accessing a PHYSICALDISK, then you are avoiding window's file cache. I just pulled up the code and (keep in mind I'm not much of a windows guy) something looks odd in your GetSecs routine. The cast to double is going to lose resolution, I think I would store the start/end times as LARGE_INTEGER. And you probably only have to call the frequency routine once Also windows used to have issues in the HAL where if a thread got moved to a different processor you'd get odd results. There is a Windows API call for setting affinity, similar to the linux sched_set_affinity. This doesn't really matter for what we are talking about, it is just a pet peeve of mine, your "delete c;" should be "delete [] c;" (are you intending to be timing your allocator calls as well? you may be if you are simulating system performance, but typically for disk performance you'd try to preallocate as much as possible so you're only timing the transfers) If it were me I would start with something simpler, (say single threaded sequential read) and see if your program gets the correct values then. You could also fire up windows performance monitor and try to correlate to its counts as well (PHYSICALDISK transfers/sec). Good Luck, Joe Quoting Nai yan zhao <zha...@gm...>: > Hello Fabian and Joe, > Thank you so much for your reply. 
> > Actually, what I am trying to do, is to split a file into 32 parts, > and each part will be assigned to a thread to read. Each thread each time > to open file, read 512B, and close file. I was trying to avoid 2 read I/Os > hit 1 block(512B) - i.e. to avoid cache in SSD (it's 128MB), although most > read I/Os are ordered but not > contiguous<http://en.wikipedia.org/wiki/Contiguity#Computer_science> > . > > By your suggestion, I tried 512B sequential I/O with settings below, > > Max disk size - 8388608 > # of Outstanding I/O - 32 (for 64, it's also around 82K) > Transfer request size - 512B, > 100% sequential > Reply size - no reply > Align I/Os on - Sector boundaries > > The result is around 82K, still much slower than my program. > > If my program has any defect in calculating IOPS? Or if I have any > misunderstanding of caching of SSD or file system, which causes my program > fetches data most from RAM of SSD? Or what parameters I should set in I/O > meter to simulate my program I/O? > > Thank you again in advance for your time to help investigate it!! > > Nai Yan. > > 2012/2/11 Fabian Tillier <fa...@ti...> > >> If I read the test correctly, all threads start at offset 0, and then >> perform 512b reads with a 1024b stride between reads. As Joe said, >> this is pretty much sequential reading, and all threads are reading >> the same data, so most are likely to be satisifed from cache, either >> in the OS or on the SSD itself. They'll do 320000/16=20000 IO >> operations total each, so end up reading 20MB of the file. It's quite >> likely that the whole 20MB that you are reading will sit happilly in >> the file cache. >> >> Create an access pattern that mimics your app (512b sequential with >> 1024b stride), create 32 workers, and see if the results are similar. >> Best would be if you created a test file of 20MB, too. You can then >> see how things compare if you go with async I/O and a single thread. 
>> >> Cheers, >> -Fab >> >> On Fri, Feb 10, 2012 at 5:40 AM, <jo...@ei...> wrote: >> > Forgive me if I missed it, but I don't see any randomization in your >> > file reads. >> > >> > It looks like you just skip ahead so thread 0 reads the first >> > 512bytes, thread 1 the next 512b. So any storage will be prefetching >> > very effectively. >> > >> > Tell Iometer to do sequential instead of random and see how much >> > closer the numbers are. Or better yet, make your program randomize >> > its reads over the entire disk. >> > >> > Joe >> > >> > >> > Quoting Nai yan zhao <zha...@gm...>: >> > >> >> Greetings, >> >> Could anybody help me a little out of my difficulty? >> >> >> >> I have a SSD and I am trying to use it to simulate my program I/O >> >> performance, however, IOPS calculated from my program is much much >> faster >> >> than IOMeter. >> >> >> >> My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B random read >> >> IOPS is around 94k (queue depth is 32). >> >> However my program (32 windows threads) can reach around 500k 512B >> >> IOPS, around 5 times of IOMeter!!! I did data validation but didn't find >> >> any error in data fetching. It's because my data fetching in order? >> >> >> >> I paste my code belwo (it mainly fetch 512B from file and release >> it; >> >> I did use 4bytes (an int) to validate program logic and didn't find >> >> problem), can anybody help me figure out where I am wrong? >> >> >> >> Thanks so much in advance!! >> >> >> >> Nai Yan. >> >> >> >> #include <stdio.h> >> >> #include <Windows.h> >> >> /* >> >> ** Purpose: Verify file random read IOPS in comparison with IOMeter >> >> ** Author: Nai Yan >> >> ** Date: Feb. 
9th, 2012 >> >> **/ >> >> //Global variables >> >> long completeIOs = 0; >> >> long completeBytes = 0; >> >> int threadCount = 32; >> >> unsigned long long length = 1073741824; //4G test file >> >> int interval = 1024; >> >> int resultArrayLen = 320000; >> >> int *result = new int[resultArrayLen]; >> >> //Method declarison >> >> double GetSecs(void); //Calculate out duration >> >> int InitPool(long long,char*,int); //Initialize test data for >> >> testing, if successful, return 1; otherwise, return a non 1 value. >> >> int * FileRead(char * path); >> >> unsigned int DataVerification(int*, int sampleItem); >> >> //Verify data fetched from pool >> >> int main() >> >> { >> >> int sampleItem = 0x1; >> >> char * fPath = "G:\\workspace\\4G.bin"; >> >> unsigned int invalidIO = 0; >> >> if (InitPool(length,fPath,sampleItem)!= 1) >> >> printf("File write err... \n"); >> >> //start do random I/Os from initialized file >> >> double start = GetSecs(); >> >> int * fetchResult = FileRead(fPath); >> >> double end = GetSecs(); >> >> printf("File read IOPS is %.4f per second.. \n",completeIOs/(end - >> start)); >> >> //start data validation, for 4 bytes fetch only >> >> // invalidIO = DataVerification(fetchResult,sampleItem); >> >> // if (invalidIO !=0) >> >> // { >> >> // printf("Total invalid data fetch IOs are %d", invalidIO); >> >> // } >> >> return 0; >> >> } >> >> >> >> >> >> int InitPool(long long length, char* path, int sample) >> >> { >> >> printf("Start initializing test data ... \n"); >> >> FILE * fp = fopen(path,"wb"); >> >> if (fp == NULL) >> >> { >> >> printf("file open err... 
\n"); >> >> exit (-1); >> >> } >> >> else //initialize file for testing >> >> { >> >> fseek(fp,0L,SEEK_SET); >> >> for (int i=0; i<length; i++) >> >> { >> >> fwrite(&sample,sizeof(int),1,fp); >> >> } >> >> fclose(fp); >> >> fp = NULL; >> >> printf("Data initialization is complete...\n"); >> >> return 1; >> >> } >> >> } >> >> double GetSecs(void) >> >> { >> >> LARGE_INTEGER frequency; >> >> LARGE_INTEGER start; >> >> if(! QueryPerformanceFrequency(&frequency)) >> >> printf("QueryPerformanceFrequency Failed\n"); >> >> if(! QueryPerformanceCounter(&start)) >> >> printf("QueryPerformanceCounter Failed\n"); >> >> return ((double)start.QuadPart/(double)frequency.QuadPart); >> >> } >> >> class input >> >> { >> >> public: >> >> char *path; >> >> int starting; >> >> input (int st, char * filePath):starting(st),path(filePath){} >> >> }; >> >> //Workers >> >> DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) >> >> { >> >> input * in = (input*) lpThreadParameter; >> >> char* path = in->path; >> >> FILE * fp = fopen(path,"rb"); >> >> int sPos = in->starting; >> >> // int * result = in->r; >> >> if(fp != NULL) >> >> { >> >> fpos_t pos; >> >> for (int i=0; i<resultArrayLen/threadCount;i++) >> >> { >> >> pos = i * interval; >> >> fsetpos(fp,&pos); >> >> //For 512 bytes fetch each time >> >> unsigned char *c =new unsigned char [512]; >> >> if (fread(c,512,1,fp) ==1) >> >> { >> >> InterlockedIncrement(&completeIOs); >> >> delete c; >> >> } >> >> //For 4 bytes fetch each time >> >> /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) >> >> { >> >> InterlockedIncrement(&completeIOs); >> >> }*/ >> >> else >> >> { >> >> printf("file read err...\n"); >> >> exit(-1); >> >> } >> >> } >> >> fclose(fp); >> >> fp = NULL; >> >> } >> >> else >> >> { >> >> printf("File open err... \n"); >> >> exit(-1); >> >> } >> >> } >> >> int * FileRead(char * p) >> >> { >> >> printf("Starting reading file ... 
\n"); >> >> HANDLE mWorkThread[256]; //max 256 threads >> >> completeIOs = 0; >> >> int slice = int (resultArrayLen/threadCount); >> >> for(int i = 0; i < threadCount; i++) >> >> { >> >> mWorkThread[i] = CreateThread( >> >> NULL, >> >> 0, >> >> FileReadThreadEntry, >> >> (LPVOID)(new input(i*slice,p)), >> >> 0, >> >> NULL); >> >> } >> >> WaitForMultipleObjects(threadCount, mWorkThread, TRUE, INFINITE); >> >> printf("File read complete... \n"); >> >> return result; >> >> } >> >> unsigned int DataVerification(int* result, int sampleItem) >> >> { >> >> unsigned int invalid = 0; >> >> for (int i=0; i< resultArrayLen/interval;i++) >> >> { >> >> if (result[i]!=sampleItem) >> >> { >> >> invalid ++; >> >> continue; >> >> } >> >> } >> >> return invalid; >> >> } >> >> >> > >> > >> > >> > >> > >> ------------------------------------------------------------------------------ >> > Virtualization & Cloud Management Using Capacity Planning >> > Cloud computing makes use of virtualization - but cloud computing >> > also focuses on allowing computing to be delivered as a service. >> > http://www.accelacomm.com/jaw/sfnl/114/51521223/ >> > _______________________________________________ >> > Iometer-devel mailing list >> > Iom...@li... >> > https://lists.sourceforge.net/lists/listinfo/iometer-devel >> > |
From: Nai y. z. <zha...@gm...> - 2012-02-12 17:56:35
Attachments:
IOMeter setting & result.doc
fileAccess.cpp
|
Hello Joe, Thank you again for your time! It's wired that from IOMeter, the throughput for sequential IOPS (512B, queue depth is 64) is ONLY 42MB/s with around 82K IOPS. However, from that SSD official website, this SSD sequential throughput should be around 510MB/s ( http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1, my SSD is 128G). If there's any parameter I didn't set correctly in IOMeter? As you suggested, I try to create a 12GB sample file (my test bed memory is 6GB and without RAID) and use 1 thread to do IO. The result is 33666; However, with I/O meter, it's 11572 (throughput this time is ONLY 5.93MB/s); IOPS still 3 times!! I attach my IOMeter settings, if there's anything wrong? Also, I attach my modified code. Joe, could you help again to see where's the problem? Thank you so much!! Nai Yan. 2012/2/13 <jo...@ei...> > 82K sounds reasonable for iops on an SSD. You should check the specs of > your drive to see what you should expect. > > You need to remember that you are doing file i/o so you have several > layers of cache involved. think of it was file cache -> block cache -> > controller cache -> drive cache (you aren't testing a HW RAID, so you > probably don't have cache in you controller) My personal run of thumb for > random I/O is to have my file size be about 3x my combined cache size. For > example, 4G ram in system, 512MB RAID cache, (8 drives*32MB) = 4.75GB I'd > do a 16GB file. > > If in iometer you are accessing a PHYSICALDISK, then you are avoiding > window's file cache. > > I just pulled up the code and (keep in mind I'm not much of a windows guy) > something looks odd in your GetSecs routine. The cast to double is going to > lose resolution, I think I would store the start/end times as > LARGE_INTEGER. And you probably only have to call the frequency routine once > > Also windows used to have issues in the HAL where if a thread got moved to > a different processor you'd get odd results. 
There is a Windows API call > for setting affinity, similar to the linux sched_set_affinity. > > This doesn't really matter for what we are talking about, it is just a pet > peeve of mine, your "delete c;" should be "delete [] c;" (are you intending > tp be timing your allocator calls as well? you may be if you are simulating > system performance, but typically for disk performance you'd try to > preallocate as much as possible so your only timing the transfers) > > > If it were me I would start with something simplier, (say single threaded > sequential read) and see if your program gets the correct values then. You > could also fire up windows performance monitor and try to correlate to its > counts as well (PHYSICALDISK transfers/sec). > > Good Luck, > > Joe > > > > Quoting Nai yan zhao <zha...@gm...>: > > Hello Fabian and Joe, >> Thank you so much for your reply. >> >> Actually, what I am trying to do, is to split a file into 32 parts, >> and each part will be assigned to a thread to read. Each thread each time >> to open file, read 512B, and close file. I was trying to avoid 2 read >> I/Os >> hit 1 block(512B) - i.e. to avoid cache in SSD (it's 128MB), although most >> read I/Os are ordered but not >> contiguous<http://en.**wikipedia.org/wiki/Contiguity#**Computer_science<http://en.wikipedia.org/wiki/Contiguity#Computer_science> >> > >> . >> >> >> By your suggestion, I tried 512B sequential I/O with settings below, >> >> Max disk size - 8388608 >> # of Outstanding I/O - 32 (for 64, it's also around 82K) >> Transfer request size - 512B, >> 100% sequential >> Reply size - no reply >> Align I/Os on - Sector boundaries >> >> The result is around 82K, still much slower than my program. >> >> If my program has any defect in calculating IOPS? Or if I have any >> misunderstanding of caching of SSD or file system, which causes my program >> fetches data most from RAM of SSD? Or what parameters I should set in I/O >> meter to simulate my program I/O? 
>> >> Thank you again in advance for your time to help investigate it!! >> >> Nai Yan. >> >> 2012/2/11 Fabian Tillier <fa...@ti...> >> >> If I read the test correctly, all threads start at offset 0, and then >>> perform 512b reads with a 1024b stride between reads. As Joe said, >>> this is pretty much sequential reading, and all threads are reading >>> the same data, so most are likely to be satisifed from cache, either >>> in the OS or on the SSD itself. They'll do 320000/16=20000 IO >>> operations total each, so end up reading 20MB of the file. It's quite >>> likely that the whole 20MB that you are reading will sit happilly in >>> the file cache. >>> >>> Create an access pattern that mimics your app (512b sequential with >>> 1024b stride), create 32 workers, and see if the results are similar. >>> Best would be if you created a test file of 20MB, too. You can then >>> see how things compare if you go with async I/O and a single thread. >>> >>> Cheers, >>> -Fab >>> >>> On Fri, Feb 10, 2012 at 5:40 AM, <jo...@ei...> wrote: >>> > Forgive me if I missed it, but I don't see any randomization in your >>> > file reads. >>> > >>> > It looks like you just skip ahead so thread 0 reads the first >>> > 512bytes, thread 1 the next 512b. So any storage will be prefetching >>> > very effectively. >>> > >>> > Tell Iometer to do sequential instead of random and see how much >>> > closer the numbers are. Or better yet, make your program randomize >>> > its reads over the entire disk. >>> > >>> > Joe >>> > >>> > >>> > Quoting Nai yan zhao <zha...@gm...>: >>> > >>> >> Greetings, >>> >> Could anybody help me a little out of my difficulty? >>> >> >>> >> I have a SSD and I am trying to use it to simulate my program I/O >>> >> performance, however, IOPS calculated from my program is much much >>> faster >>> >> than IOMeter. >>> >> >>> >> My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B random read >>> >> IOPS is around 94k (queue depth is 32). 
>>> >> However my program (32 windows threads) can reach around 500k >>> 512B >>> >> IOPS, around 5 times of IOMeter!!! I did data validation but didn't >>> find >>> >> any error in data fetching. It's because my data fetching in order? >>> >> >>> >> I paste my code belwo (it mainly fetch 512B from file and release >>> it; >>> >> I did use 4bytes (an int) to validate program logic and didn't find >>> >> problem), can anybody help me figure out where I am wrong? >>> >> >>> >> Thanks so much in advance!! >>> >> >>> >> Nai Yan. >>> >> >>> >> #include <stdio.h> >>> >> #include <Windows.h> >>> >> /* >>> >> ** Purpose: Verify file random read IOPS in comparison with IOMeter >>> >> ** Author: Nai Yan >>> >> ** Date: Feb. 9th, 2012 >>> >> **/ >>> >> //Global variables >>> >> long completeIOs = 0; >>> >> long completeBytes = 0; >>> >> int threadCount = 32; >>> >> unsigned long long length = 1073741824; //4G test >>> file >>> >> int interval = 1024; >>> >> int resultArrayLen = 320000; >>> >> int *result = new int[resultArrayLen]; >>> >> //Method declarison >>> >> double GetSecs(void); //Calculate out duration >>> >> int InitPool(long long,char*,int); //Initialize test data for >>> >> testing, if successful, return 1; otherwise, return a non 1 value. >>> >> int * FileRead(char * path); >>> >> unsigned int DataVerification(int*, int sampleItem); >>> >> //Verify data fetched from pool >>> >> int main() >>> >> { >>> >> int sampleItem = 0x1; >>> >> char * fPath = "G:\\workspace\\4G.bin"; >>> >> unsigned int invalidIO = 0; >>> >> if (InitPool(length,fPath,**sampleItem)!= 1) >>> >> printf("File write err... \n"); >>> >> //start do random I/Os from initialized file >>> >> double start = GetSecs(); >>> >> int * fetchResult = FileRead(fPath); >>> >> double end = GetSecs(); >>> >> printf("File read IOPS is %.4f per second.. 
\n",completeIOs/(end - >>> start)); >>> >> //start data validation, for 4 bytes fetch only >>> >> // invalidIO = DataVerification(fetchResult,**sampleItem); >>> >> // if (invalidIO !=0) >>> >> // { >>> >> // printf("Total invalid data fetch IOs are %d", invalidIO); >>> >> // } >>> >> return 0; >>> >> } >>> >> >>> >> >>> >> int InitPool(long long length, char* path, int sample) >>> >> { >>> >> printf("Start initializing test data ... \n"); >>> >> FILE * fp = fopen(path,"wb"); >>> >> if (fp == NULL) >>> >> { >>> >> printf("file open err... \n"); >>> >> exit (-1); >>> >> } >>> >> else //initialize file for testing >>> >> { >>> >> fseek(fp,0L,SEEK_SET); >>> >> for (int i=0; i<length; i++) >>> >> { >>> >> fwrite(&sample,sizeof(int),1,**fp); >>> >> } >>> >> fclose(fp); >>> >> fp = NULL; >>> >> printf("Data initialization is complete...\n"); >>> >> return 1; >>> >> } >>> >> } >>> >> double GetSecs(void) >>> >> { >>> >> LARGE_INTEGER frequency; >>> >> LARGE_INTEGER start; >>> >> if(! QueryPerformanceFrequency(&**frequency)) >>> >> printf("**QueryPerformanceFrequency Failed\n"); >>> >> if(! 
QueryPerformanceCounter(&**start)) >>> >> printf("**QueryPerformanceCounter Failed\n"); >>> >> return ((double)start.QuadPart/(**double)frequency.QuadPart); >>> >> } >>> >> class input >>> >> { >>> >> public: >>> >> char *path; >>> >> int starting; >>> >> input (int st, char * filePath):starting(st),path(**filePath){} >>> >> }; >>> >> //Workers >>> >> DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) >>> >> { >>> >> input * in = (input*) lpThreadParameter; >>> >> char* path = in->path; >>> >> FILE * fp = fopen(path,"rb"); >>> >> int sPos = in->starting; >>> >> // int * result = in->r; >>> >> if(fp != NULL) >>> >> { >>> >> fpos_t pos; >>> >> for (int i=0; i<resultArrayLen/threadCount;**i++) >>> >> { >>> >> pos = i * interval; >>> >> fsetpos(fp,&pos); >>> >> //For 512 bytes fetch each time >>> >> unsigned char *c =new unsigned char [512]; >>> >> if (fread(c,512,1,fp) ==1) >>> >> { >>> >> InterlockedIncrement(&**completeIOs); >>> >> delete c; >>> >> } >>> >> //For 4 bytes fetch each time >>> >> /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) >>> >> { >>> >> InterlockedIncrement(&**completeIOs); >>> >> }*/ >>> >> else >>> >> { >>> >> printf("file read err...\n"); >>> >> exit(-1); >>> >> } >>> >> } >>> >> fclose(fp); >>> >> fp = NULL; >>> >> } >>> >> else >>> >> { >>> >> printf("File open err... \n"); >>> >> exit(-1); >>> >> } >>> >> } >>> >> int * FileRead(char * p) >>> >> { >>> >> printf("Starting reading file ... \n"); >>> >> HANDLE mWorkThread[256]; //max 256 threads >>> >> completeIOs = 0; >>> >> int slice = int (resultArrayLen/threadCount); >>> >> for(int i = 0; i < threadCount; i++) >>> >> { >>> >> mWorkThread[i] = CreateThread( >>> >> NULL, >>> >> 0, >>> >> FileReadThreadEntry, >>> >> (LPVOID)(new input(i*slice,p)), >>> >> 0, >>> >> NULL); >>> >> } >>> >> WaitForMultipleObjects(**threadCount, mWorkThread, TRUE, INFINITE); >>> >> printf("File read complete... 
\n"); >>> >> return result; >>> >> } >>> >> unsigned int DataVerification(int* result, int sampleItem) >>> >> { >>> >> unsigned int invalid = 0; >>> >> for (int i=0; i< resultArrayLen/interval;i++) >>> >> { >>> >> if (result[i]!=sampleItem) >>> >> { >>> >> invalid ++; >>> >> continue; >>> >> } >>> >> } >>> >> return invalid; >>> >> } >>> >> >>> > >>> > >>> > >>> > >>> > >>> ------------------------------**------------------------------** >>> ------------------ >>> > Virtualization & Cloud Management Using Capacity Planning >>> > Cloud computing makes use of virtualization - but cloud computing >>> > also focuses on allowing computing to be delivered as a service. >>> > http://www.accelacomm.com/jaw/**sfnl/114/51521223/<http://www.accelacomm.com/jaw/sfnl/114/51521223/> >>> > ______________________________**_________________ >>> > Iometer-devel mailing list >>> > Iometer-devel@lists.**sourceforge.net<Iom...@li...> >>> > https://lists.sourceforge.net/**lists/listinfo/iometer-devel<https://lists.sourceforge.net/lists/listinfo/iometer-devel> >>> >>> >> > > > |
From: <jo...@ei...> - 2012-02-12 20:34:58
|
Manufacturer's quoted sequential MB/s won't be with 512byte reads. In Iometer, try 256KB sequential reads with about 8 outstanding I/Os. That should come closer to the maximum throughput(I doubt you'll be able to get your laptop to actually get close to 520MB/s though). I'll see if I can find a windows system to try to compile/run your program, but I can't make any promises. Joe Quoting Nai yan zhao <zha...@gm...>: > Hello Joe, > Thank you again for your time! > It's wired that from IOMeter, the throughput for sequential IOPS > (512B, queue depth is 64) is ONLY 42MB/s with around 82K IOPS. However, > from that SSD official website, this SSD sequential throughput should be > around 510MB/s ( > http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1, my SSD > is 128G). If there's any parameter I didn't set correctly in IOMeter? > > As you suggested, I try to create a 12GB sample file (my test bed > memory is 6GB and without RAID) and use 1 thread to do IO. The result > is 33666; However, with I/O meter, it's 11572 (throughput this time is ONLY > 5.93MB/s); IOPS still 3 times!! > > I attach my IOMeter settings, if there's anything wrong? Also, I > attach my modified code. Joe, could you help again to see where's the > problem? > > Thank you so much!! > > Nai Yan. > > 2012/2/13 <jo...@ei...> > >> 82K sounds reasonable for iops on an SSD. You should check the specs of >> your drive to see what you should expect. >> >> You need to remember that you are doing file i/o so you have several >> layers of cache involved. think of it was file cache -> block cache -> >> controller cache -> drive cache (you aren't testing a HW RAID, so you >> probably don't have cache in you controller) My personal run of thumb for >> random I/O is to have my file size be about 3x my combined cache size. For >> example, 4G ram in system, 512MB RAID cache, (8 drives*32MB) = 4.75GB I'd >> do a 16GB file. 
>> >> If in iometer you are accessing a PHYSICALDISK, then you are avoiding >> window's file cache. >> >> I just pulled up the code and (keep in mind I'm not much of a windows guy) >> something looks odd in your GetSecs routine. The cast to double is going to >> lose resolution, I think I would store the start/end times as >> LARGE_INTEGER. And you probably only have to call the frequency routine once >> >> Also windows used to have issues in the HAL where if a thread got moved to >> a different processor you'd get odd results. There is a Windows API call >> for setting affinity, similar to the linux sched_set_affinity. >> >> This doesn't really matter for what we are talking about, it is just a pet >> peeve of mine, your "delete c;" should be "delete [] c;" (are you intending >> tp be timing your allocator calls as well? you may be if you are simulating >> system performance, but typically for disk performance you'd try to >> preallocate as much as possible so your only timing the transfers) >> >> >> If it were me I would start with something simplier, (say single threaded >> sequential read) and see if your program gets the correct values then. You >> could also fire up windows performance monitor and try to correlate to its >> counts as well (PHYSICALDISK transfers/sec). >> >> Good Luck, >> >> Joe >> >> >> >> Quoting Nai yan zhao <zha...@gm...>: >> >> Hello Fabian and Joe, >>> Thank you so much for your reply. >>> >>> Actually, what I am trying to do, is to split a file into 32 parts, >>> and each part will be assigned to a thread to read. Each thread each time >>> to open file, read 512B, and close file. I was trying to avoid 2 read >>> I/Os >>> hit 1 block(512B) - i.e. to avoid cache in SSD (it's 128MB), although most >>> read I/Os are ordered but not >>> contiguous<http://en.**wikipedia.org/wiki/Contiguity#**Computer_science<http://en.wikipedia.org/wiki/Contiguity#Computer_science> >>> > >>> . 
>>> >>> >>> By your suggestion, I tried 512B sequential I/O with settings below, >>> >>> Max disk size - 8388608 >>> # of Outstanding I/O - 32 (for 64, it's also around 82K) >>> Transfer request size - 512B, >>> 100% sequential >>> Reply size - no reply >>> Align I/Os on - Sector boundaries >>> >>> The result is around 82K, still much slower than my program. >>> >>> If my program has any defect in calculating IOPS? Or if I have any >>> misunderstanding of caching of SSD or file system, which causes my program >>> fetches data most from RAM of SSD? Or what parameters I should set in I/O >>> meter to simulate my program I/O? >>> >>> Thank you again in advance for your time to help investigate it!! >>> >>> Nai Yan. >>> >>> 2012/2/11 Fabian Tillier <fa...@ti...> >>> >>> If I read the test correctly, all threads start at offset 0, and then >>>> perform 512b reads with a 1024b stride between reads. As Joe said, >>>> this is pretty much sequential reading, and all threads are reading >>>> the same data, so most are likely to be satisifed from cache, either >>>> in the OS or on the SSD itself. They'll do 320000/16=20000 IO >>>> operations total each, so end up reading 20MB of the file. It's quite >>>> likely that the whole 20MB that you are reading will sit happilly in >>>> the file cache. >>>> >>>> Create an access pattern that mimics your app (512b sequential with >>>> 1024b stride), create 32 workers, and see if the results are similar. >>>> Best would be if you created a test file of 20MB, too. You can then >>>> see how things compare if you go with async I/O and a single thread. >>>> >>>> Cheers, >>>> -Fab >>>> >>>> On Fri, Feb 10, 2012 at 5:40 AM, <jo...@ei...> wrote: >>>> > Forgive me if I missed it, but I don't see any randomization in your >>>> > file reads. >>>> > >>>> > It looks like you just skip ahead so thread 0 reads the first >>>> > 512bytes, thread 1 the next 512b. So any storage will be prefetching >>>> > very effectively. 
>>>> > >>>> > Tell Iometer to do sequential instead of random and see how much >>>> > closer the numbers are. Or better yet, make your program randomize >>>> > its reads over the entire disk. >>>> > >>>> > Joe >>>> > >>>> > >>>> > Quoting Nai yan zhao <zha...@gm...>: >>>> > >>>> >> Greetings, >>>> >> Could anybody help me a little out of my difficulty? >>>> >> >>>> >> I have a SSD and I am trying to use it to simulate my program I/O >>>> >> performance, however, IOPS calculated from my program is much much >>>> faster >>>> >> than IOMeter. >>>> >> >>>> >> My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B random read >>>> >> IOPS is around 94k (queue depth is 32). >>>> >> However my program (32 windows threads) can reach around 500k >>>> 512B >>>> >> IOPS, around 5 times of IOMeter!!! I did data validation but didn't >>>> find >>>> >> any error in data fetching. It's because my data fetching in order? >>>> >> >>>> >> I paste my code belwo (it mainly fetch 512B from file and release >>>> it; >>>> >> I did use 4bytes (an int) to validate program logic and didn't find >>>> >> problem), can anybody help me figure out where I am wrong? >>>> >> >>>> >> Thanks so much in advance!! >>>> >> >>>> >> Nai Yan. >>>> >> >>>> >> #include <stdio.h> >>>> >> #include <Windows.h> >>>> >> /* >>>> >> ** Purpose: Verify file random read IOPS in comparison with IOMeter >>>> >> ** Author: Nai Yan >>>> >> ** Date: Feb. 9th, 2012 >>>> >> **/ >>>> >> //Global variables >>>> >> long completeIOs = 0; >>>> >> long completeBytes = 0; >>>> >> int threadCount = 32; >>>> >> unsigned long long length = 1073741824; //4G test >>>> file >>>> >> int interval = 1024; >>>> >> int resultArrayLen = 320000; >>>> >> int *result = new int[resultArrayLen]; >>>> >> //Method declarison >>>> >> double GetSecs(void); //Calculate out duration >>>> >> int InitPool(long long,char*,int); //Initialize test data for >>>> >> testing, if successful, return 1; otherwise, return a non 1 value. 
>>>> >> int * FileRead(char * path); >>>> >> unsigned int DataVerification(int*, int sampleItem); >>>> >> //Verify data fetched from pool >>>> >> int main() >>>> >> { >>>> >> int sampleItem = 0x1; >>>> >> char * fPath = "G:\\workspace\\4G.bin"; >>>> >> unsigned int invalidIO = 0; >>>> >> if (InitPool(length,fPath,**sampleItem)!= 1) >>>> >> printf("File write err... \n"); >>>> >> //start do random I/Os from initialized file >>>> >> double start = GetSecs(); >>>> >> int * fetchResult = FileRead(fPath); >>>> >> double end = GetSecs(); >>>> >> printf("File read IOPS is %.4f per second.. \n",completeIOs/(end - >>>> start)); >>>> >> //start data validation, for 4 bytes fetch only >>>> >> // invalidIO = DataVerification(fetchResult,**sampleItem); >>>> >> // if (invalidIO !=0) >>>> >> // { >>>> >> // printf("Total invalid data fetch IOs are %d", invalidIO); >>>> >> // } >>>> >> return 0; >>>> >> } >>>> >> >>>> >> >>>> >> int InitPool(long long length, char* path, int sample) >>>> >> { >>>> >> printf("Start initializing test data ... \n"); >>>> >> FILE * fp = fopen(path,"wb"); >>>> >> if (fp == NULL) >>>> >> { >>>> >> printf("file open err... \n"); >>>> >> exit (-1); >>>> >> } >>>> >> else //initialize file for testing >>>> >> { >>>> >> fseek(fp,0L,SEEK_SET); >>>> >> for (int i=0; i<length; i++) >>>> >> { >>>> >> fwrite(&sample,sizeof(int),1,**fp); >>>> >> } >>>> >> fclose(fp); >>>> >> fp = NULL; >>>> >> printf("Data initialization is complete...\n"); >>>> >> return 1; >>>> >> } >>>> >> } >>>> >> double GetSecs(void) >>>> >> { >>>> >> LARGE_INTEGER frequency; >>>> >> LARGE_INTEGER start; >>>> >> if(! QueryPerformanceFrequency(&**frequency)) >>>> >> printf("**QueryPerformanceFrequency Failed\n"); >>>> >> if(! 
QueryPerformanceCounter(&**start)) >>>> >> printf("**QueryPerformanceCounter Failed\n"); >>>> >> return ((double)start.QuadPart/(**double)frequency.QuadPart); >>>> >> } >>>> >> class input >>>> >> { >>>> >> public: >>>> >> char *path; >>>> >> int starting; >>>> >> input (int st, char * filePath):starting(st),path(**filePath){} >>>> >> }; >>>> >> //Workers >>>> >> DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) >>>> >> { >>>> >> input * in = (input*) lpThreadParameter; >>>> >> char* path = in->path; >>>> >> FILE * fp = fopen(path,"rb"); >>>> >> int sPos = in->starting; >>>> >> // int * result = in->r; >>>> >> if(fp != NULL) >>>> >> { >>>> >> fpos_t pos; >>>> >> for (int i=0; i<resultArrayLen/threadCount;**i++) >>>> >> { >>>> >> pos = i * interval; >>>> >> fsetpos(fp,&pos); >>>> >> //For 512 bytes fetch each time >>>> >> unsigned char *c =new unsigned char [512]; >>>> >> if (fread(c,512,1,fp) ==1) >>>> >> { >>>> >> InterlockedIncrement(&**completeIOs); >>>> >> delete c; >>>> >> } >>>> >> //For 4 bytes fetch each time >>>> >> /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) >>>> >> { >>>> >> InterlockedIncrement(&**completeIOs); >>>> >> }*/ >>>> >> else >>>> >> { >>>> >> printf("file read err...\n"); >>>> >> exit(-1); >>>> >> } >>>> >> } >>>> >> fclose(fp); >>>> >> fp = NULL; >>>> >> } >>>> >> else >>>> >> { >>>> >> printf("File open err... \n"); >>>> >> exit(-1); >>>> >> } >>>> >> } >>>> >> int * FileRead(char * p) >>>> >> { >>>> >> printf("Starting reading file ... \n"); >>>> >> HANDLE mWorkThread[256]; //max 256 threads >>>> >> completeIOs = 0; >>>> >> int slice = int (resultArrayLen/threadCount); >>>> >> for(int i = 0; i < threadCount; i++) >>>> >> { >>>> >> mWorkThread[i] = CreateThread( >>>> >> NULL, >>>> >> 0, >>>> >> FileReadThreadEntry, >>>> >> (LPVOID)(new input(i*slice,p)), >>>> >> 0, >>>> >> NULL); >>>> >> } >>>> >> WaitForMultipleObjects(**threadCount, mWorkThread, TRUE, INFINITE); >>>> >> printf("File read complete... 
\n"); >>>> >> return result; >>>> >> } >>>> >> unsigned int DataVerification(int* result, int sampleItem) >>>> >> { >>>> >> unsigned int invalid = 0; >>>> >> for (int i=0; i< resultArrayLen/interval;i++) >>>> >> { >>>> >> if (result[i]!=sampleItem) >>>> >> { >>>> >> invalid ++; >>>> >> continue; >>>> >> } >>>> >> } >>>> >> return invalid; >>>> >> } >>>> >> >>>> > >>>> > >>>> > >>>> > >>>> > >>>> ------------------------------**------------------------------** >>>> ------------------ >>>> > Virtualization & Cloud Management Using Capacity Planning >>>> > Cloud computing makes use of virtualization - but cloud computing >>>> > also focuses on allowing computing to be delivered as a service. >>>> > >>>> http://www.accelacomm.com/jaw/**sfnl/114/51521223/<http://www.accelacomm.com/jaw/sfnl/114/51521223/> >>>> > ______________________________**_________________ >>>> > Iometer-devel mailing list >>>> > >>>> Iometer-devel@lists.**sourceforge.net<Iom...@li...> >>>> > >>>> https://lists.sourceforge.net/**lists/listinfo/iometer-devel<https://lists.sourceforge.net/lists/listinfo/iometer-devel> >>>> >>>> >>> >> >> >> > |
From: Nai y. z. <zha...@gm...> - 2012-02-13 03:00:05
|
Hello Joe, Again, thank you for your reply! I will take your suggestion and try again. But I am very looking forward to your further investigation on Windows system for my program. I trust IOMeter, but I can't explain why and where's the problem with my program. And further speaking, would you give me some comments? 1) What's the difference between IOmeter I/O calculation and my program (although it's much much simpler)? From the behavior of IOMeter, it also seems to create a file on target disk and MAYBE fetch data from that file by pre-defined I/O size and policy. If I am wrong? If I am not wrong, then why there's so much difference. Joe, by your experience, if my program has any big defect? 2) My major purpose is to have a program in our production env. ,which will frequently fetch data from SSD, and there are also some additional operations/work after data fetched - this is also why you see I put some additional work after each I/O (such as memory allocation and de-allocation in I/O calculation). What I expect to see, its benchmark SHOULD be less than I/OMeter benchmark. Would you advise more? Is there any big defect in my program for either doing file I/O or I/O calculation? Thanks in advance!! Nai Yan. 2012/2/13 <jo...@ei...> > Manufacturer's quoted sequential MB/s won't be with 512byte reads. In > Iometer, try 256KB sequential reads with about 8 outstanding I/Os. That > should come closer to the maximum throughput(I doubt you'll be able to get > your laptop to actually get close to 520MB/s though). > > I'll see if I can find a windows system to try to compile/run your > program, but I can't make any promises. > > > Joe > > > Quoting Nai yan zhao <zha...@gm...>: > > Hello Joe, >> Thank you again for your time! >> It's wired that from IOMeter, the throughput for sequential IOPS >> (512B, queue depth is 64) is ONLY 42MB/s with around 82K IOPS. 
However, >> from that SSD official website, this SSD sequential throughput should be >> around 510MB/s ( >> http://www.plextoramericas.**com/index.php/ssd/px-m3-**series?start=1<http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1>, >> my SSD >> is 128G). If there's any parameter I didn't set correctly in IOMeter? >> >> As you suggested, I try to create a 12GB sample file (my test bed >> memory is 6GB and without RAID) and use 1 thread to do IO. The result >> is 33666; However, with I/O meter, it's 11572 (throughput this time is >> ONLY >> 5.93MB/s); IOPS still 3 times!! >> >> I attach my IOMeter settings, if there's anything wrong? Also, I >> attach my modified code. Joe, could you help again to see where's the >> problem? >> >> Thank you so much!! >> >> Nai Yan. >> >> 2012/2/13 <jo...@ei...> >> >> 82K sounds reasonable for iops on an SSD. You should check the specs of >>> your drive to see what you should expect. >>> >>> You need to remember that you are doing file i/o so you have several >>> layers of cache involved. think of it was file cache -> block cache -> >>> controller cache -> drive cache (you aren't testing a HW RAID, so you >>> probably don't have cache in you controller) My personal run of thumb for >>> random I/O is to have my file size be about 3x my combined cache size. >>> For >>> example, 4G ram in system, 512MB RAID cache, (8 drives*32MB) = 4.75GB I'd >>> do a 16GB file. >>> >>> If in iometer you are accessing a PHYSICALDISK, then you are avoiding >>> window's file cache. >>> >>> I just pulled up the code and (keep in mind I'm not much of a windows >>> guy) >>> something looks odd in your GetSecs routine. The cast to double is going >>> to >>> lose resolution, I think I would store the start/end times as >>> LARGE_INTEGER. And you probably only have to call the frequency routine >>> once >>> >>> Also windows used to have issues in the HAL where if a thread got moved >>> to >>> a different processor you'd get odd results. 
There is a Windows API call >>> for setting affinity, similar to the linux sched_set_affinity. >>> >>> This doesn't really matter for what we are talking about, it is just a >>> pet >>> peeve of mine, your "delete c;" should be "delete [] c;" (are you >>> intending >>> tp be timing your allocator calls as well? you may be if you are >>> simulating >>> system performance, but typically for disk performance you'd try to >>> preallocate as much as possible so your only timing the transfers) >>> >>> >>> If it were me I would start with something simplier, (say single threaded >>> sequential read) and see if your program gets the correct values then. >>> You >>> could also fire up windows performance monitor and try to correlate to >>> its >>> counts as well (PHYSICALDISK transfers/sec). >>> >>> Good Luck, >>> >>> Joe >>> >>> >>> >>> Quoting Nai yan zhao <zha...@gm...>: >>> >>> Hello Fabian and Joe, >>> >>>> Thank you so much for your reply. >>>> >>>> Actually, what I am trying to do, is to split a file into 32 parts, >>>> and each part will be assigned to a thread to read. Each thread each >>>> time >>>> to open file, read 512B, and close file. I was trying to avoid 2 read >>>> I/Os >>>> hit 1 block(512B) - i.e. to avoid cache in SSD (it's 128MB), although >>>> most >>>> read I/Os are ordered but not >>>> contiguous<http://en.**wikiped**ia.org/wiki/Contiguity#**** >>>> Computer_science<http://wikipedia.org/wiki/Contiguity#**Computer_science> >>>> <http://en.**wikipedia.org/wiki/Contiguity#**Computer_science<http://en.wikipedia.org/wiki/Contiguity#Computer_science> >>>> > >>>> >>>> > >>>> . >>>> >>>> >>>> By your suggestion, I tried 512B sequential I/O with settings below, >>>> >>>> Max disk size - 8388608 >>>> # of Outstanding I/O - 32 (for 64, it's also around 82K) >>>> Transfer request size - 512B, >>>> 100% sequential >>>> Reply size - no reply >>>> Align I/Os on - Sector boundaries >>>> >>>> The result is around 82K, still much slower than my program. 
>>>> >>>> If my program has any defect in calculating IOPS? Or if I have any >>>> misunderstanding of caching of SSD or file system, which causes my >>>> program >>>> fetches data most from RAM of SSD? Or what parameters I should set in >>>> I/O >>>> meter to simulate my program I/O? >>>> >>>> Thank you again in advance for your time to help investigate it!! >>>> >>>> Nai Yan. >>>> >>>> 2012/2/11 Fabian Tillier <fa...@ti...> >>>> >>>> If I read the test correctly, all threads start at offset 0, and then >>>> >>>>> perform 512b reads with a 1024b stride between reads. As Joe said, >>>>> this is pretty much sequential reading, and all threads are reading >>>>> the same data, so most are likely to be satisifed from cache, either >>>>> in the OS or on the SSD itself. They'll do 320000/16=20000 IO >>>>> operations total each, so end up reading 20MB of the file. It's quite >>>>> likely that the whole 20MB that you are reading will sit happilly in >>>>> the file cache. >>>>> >>>>> Create an access pattern that mimics your app (512b sequential with >>>>> 1024b stride), create 32 workers, and see if the results are similar. >>>>> Best would be if you created a test file of 20MB, too. You can then >>>>> see how things compare if you go with async I/O and a single thread. >>>>> >>>>> Cheers, >>>>> -Fab >>>>> >>>>> On Fri, Feb 10, 2012 at 5:40 AM, <jo...@ei...> wrote: >>>>> > Forgive me if I missed it, but I don't see any randomization in your >>>>> > file reads. >>>>> > >>>>> > It looks like you just skip ahead so thread 0 reads the first >>>>> > 512bytes, thread 1 the next 512b. So any storage will be prefetching >>>>> > very effectively. >>>>> > >>>>> > Tell Iometer to do sequential instead of random and see how much >>>>> > closer the numbers are. Or better yet, make your program randomize >>>>> > its reads over the entire disk. 
>>>>> > >>>>> > Joe >>>>> > >>>>> > >>>>> > Quoting Nai yan zhao <zha...@gm...>: >>>>> > >>>>> >> Greetings, >>>>> >> Could anybody help me a little out of my difficulty? >>>>> >> >>>>> >> I have a SSD and I am trying to use it to simulate my program >>>>> I/O >>>>> >> performance, however, IOPS calculated from my program is much much >>>>> faster >>>>> >> than IOMeter. >>>>> >> >>>>> >> My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B random >>>>> read >>>>> >> IOPS is around 94k (queue depth is 32). >>>>> >> However my program (32 windows threads) can reach around 500k >>>>> 512B >>>>> >> IOPS, around 5 times of IOMeter!!! I did data validation but didn't >>>>> find >>>>> >> any error in data fetching. It's because my data fetching in order? >>>>> >> >>>>> >> I paste my code belwo (it mainly fetch 512B from file and >>>>> release >>>>> it; >>>>> >> I did use 4bytes (an int) to validate program logic and didn't find >>>>> >> problem), can anybody help me figure out where I am wrong? >>>>> >> >>>>> >> Thanks so much in advance!! >>>>> >> >>>>> >> Nai Yan. >>>>> >> >>>>> >> #include <stdio.h> >>>>> >> #include <Windows.h> >>>>> >> /* >>>>> >> ** Purpose: Verify file random read IOPS in comparison with IOMeter >>>>> >> ** Author: Nai Yan >>>>> >> ** Date: Feb. 9th, 2012 >>>>> >> **/ >>>>> >> //Global variables >>>>> >> long completeIOs = 0; >>>>> >> long completeBytes = 0; >>>>> >> int threadCount = 32; >>>>> >> unsigned long long length = 1073741824; //4G test >>>>> file >>>>> >> int interval = 1024; >>>>> >> int resultArrayLen = 320000; >>>>> >> int *result = new int[resultArrayLen]; >>>>> >> //Method declarison >>>>> >> double GetSecs(void); //Calculate out duration >>>>> >> int InitPool(long long,char*,int); //Initialize test data >>>>> for >>>>> >> testing, if successful, return 1; otherwise, return a non 1 value. 
>>>>> >> int * FileRead(char * path); >>>>> >> unsigned int DataVerification(int*, int sampleItem); >>>>> >> //Verify data fetched from pool >>>>> >> int main() >>>>> >> { >>>>> >> int sampleItem = 0x1; >>>>> >> char * fPath = "G:\\workspace\\4G.bin"; >>>>> >> unsigned int invalidIO = 0; >>>>> >> if (InitPool(length,fPath,****sampleItem)!= 1) >>>>> >>>>> >> printf("File write err... \n"); >>>>> >> //start do random I/Os from initialized file >>>>> >> double start = GetSecs(); >>>>> >> int * fetchResult = FileRead(fPath); >>>>> >> double end = GetSecs(); >>>>> >> printf("File read IOPS is %.4f per second.. \n",completeIOs/(end - >>>>> start)); >>>>> >> //start data validation, for 4 bytes fetch only >>>>> >> // invalidIO = DataVerification(fetchResult,****sampleItem); >>>>> >>>>> >> // if (invalidIO !=0) >>>>> >> // { >>>>> >> // printf("Total invalid data fetch IOs are %d", invalidIO); >>>>> >> // } >>>>> >> return 0; >>>>> >> } >>>>> >> >>>>> >> >>>>> >> int InitPool(long long length, char* path, int sample) >>>>> >> { >>>>> >> printf("Start initializing test data ... \n"); >>>>> >> FILE * fp = fopen(path,"wb"); >>>>> >> if (fp == NULL) >>>>> >> { >>>>> >> printf("file open err... \n"); >>>>> >> exit (-1); >>>>> >> } >>>>> >> else //initialize file for testing >>>>> >> { >>>>> >> fseek(fp,0L,SEEK_SET); >>>>> >> for (int i=0; i<length; i++) >>>>> >> { >>>>> >> fwrite(&sample,sizeof(int),1,****fp); >>>>> >>>>> >> } >>>>> >> fclose(fp); >>>>> >> fp = NULL; >>>>> >> printf("Data initialization is complete...\n"); >>>>> >> return 1; >>>>> >> } >>>>> >> } >>>>> >> double GetSecs(void) >>>>> >> { >>>>> >> LARGE_INTEGER frequency; >>>>> >> LARGE_INTEGER start; >>>>> >> if(! QueryPerformanceFrequency(&****frequency)) >>>>> >> printf("****QueryPerformanceFrequency Failed\n"); >>>>> >> if(! 
QueryPerformanceCounter(&****start)) >>>>> >> printf("****QueryPerformanceCounter Failed\n"); >>>>> >> return ((double)start.QuadPart/(****double)frequency.QuadPart); >>>>> >>>>> >> } >>>>> >> class input >>>>> >> { >>>>> >> public: >>>>> >> char *path; >>>>> >> int starting; >>>>> >> input (int st, char * filePath):starting(st),path(****filePath){} >>>>> >>>>> >> }; >>>>> >> //Workers >>>>> >> DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) >>>>> >> { >>>>> >> input * in = (input*) lpThreadParameter; >>>>> >> char* path = in->path; >>>>> >> FILE * fp = fopen(path,"rb"); >>>>> >> int sPos = in->starting; >>>>> >> // int * result = in->r; >>>>> >> if(fp != NULL) >>>>> >> { >>>>> >> fpos_t pos; >>>>> >> for (int i=0; i<resultArrayLen/threadCount;****i++) >>>>> >>>>> >> { >>>>> >> pos = i * interval; >>>>> >> fsetpos(fp,&pos); >>>>> >> //For 512 bytes fetch each time >>>>> >> unsigned char *c =new unsigned char [512]; >>>>> >> if (fread(c,512,1,fp) ==1) >>>>> >> { >>>>> >> InterlockedIncrement(&****completeIOs); >>>>> >>>>> >> delete c; >>>>> >> } >>>>> >> //For 4 bytes fetch each time >>>>> >> /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) >>>>> >> { >>>>> >> InterlockedIncrement(&****completeIOs); >>>>> >>>>> >> }*/ >>>>> >> else >>>>> >> { >>>>> >> printf("file read err...\n"); >>>>> >> exit(-1); >>>>> >> } >>>>> >> } >>>>> >> fclose(fp); >>>>> >> fp = NULL; >>>>> >> } >>>>> >> else >>>>> >> { >>>>> >> printf("File open err... \n"); >>>>> >> exit(-1); >>>>> >> } >>>>> >> } >>>>> >> int * FileRead(char * p) >>>>> >> { >>>>> >> printf("Starting reading file ... 
\n"); >>>>> >> HANDLE mWorkThread[256]; //max 256 threads >>>>> >> completeIOs = 0; >>>>> >> int slice = int (resultArrayLen/threadCount); >>>>> >> for(int i = 0; i < threadCount; i++) >>>>> >> { >>>>> >> mWorkThread[i] = CreateThread( >>>>> >> NULL, >>>>> >> 0, >>>>> >> FileReadThreadEntry, >>>>> >> (LPVOID)(new input(i*slice,p)), >>>>> >> 0, >>>>> >> NULL); >>>>> >> } >>>>> >> WaitForMultipleObjects(****threadCount, mWorkThread, TRUE, >>>>> INFINITE); >>>>> >>>>> >> printf("File read complete... \n"); >>>>> >> return result; >>>>> >> } >>>>> >> unsigned int DataVerification(int* result, int sampleItem) >>>>> >> { >>>>> >> unsigned int invalid = 0; >>>>> >> for (int i=0; i< resultArrayLen/interval;i++) >>>>> >> { >>>>> >> if (result[i]!=sampleItem) >>>>> >> { >>>>> >> invalid ++; >>>>> >> continue; >>>>> >> } >>>>> >> } >>>>> >> return invalid; >>>>> >> } >>>>> >> >>>>> > >>>>> > >>>>> > >>>>> > >>>>> > >>>>> ------------------------------****----------------------------**--** >>>>> >>>>> ------------------ >>>>> > Virtualization & Cloud Management Using Capacity Planning >>>>> > Cloud computing makes use of virtualization - but cloud computing >>>>> > also focuses on allowing computing to be delivered as a service. 
>>>>> > http://www.accelacomm.com/jaw/****sfnl/114/51521223/<http://www.accelacomm.com/jaw/**sfnl/114/51521223/> >>>>> <http://**www.accelacomm.com/jaw/sfnl/**114/51521223/<http://www.accelacomm.com/jaw/sfnl/114/51521223/> >>>>> > >>>>> > ______________________________****_________________ >>>>> > Iometer-devel mailing list >>>>> > Iometer-devel@lists.**sourcefo**rge.net <http://sourceforge.net>< >>>>> Iometer-devel@lists.**sourceforge.net<Iom...@li...> >>>>> > >>>>> > https://lists.sourceforge.net/****lists/listinfo/iometer-devel<https://lists.sourceforge.net/**lists/listinfo/iometer-devel> >>>>> **<https://lists.sourceforge.**net/lists/listinfo/iometer-**devel<https://lists.sourceforge.net/lists/listinfo/iometer-devel> >>>>> > >>>>> >>>>> >>>>> >>>> >>> >>> >>> >> > > > |
From: Fabian T. <fa...@ti...> - 2012-02-13 21:33:19
|
Hi Nai Yan, 2012/2/12 Nai yan zhao <zha...@gm...>: > Hello Joe, > Again, thank you for your reply! I will take your suggestion and try > again. But I am very looking forward to your further investigation on > Windows system for my program. > > I trust IOMeter, but I can't explain why and where's the problem with > my program. And further speaking, would you give me some comments? > 1) What's the difference between IOmeter I/O calculation and my program > (although it's much much simpler)? From the behavior of IOMeter, it also > seems to create a file on target disk and MAYBE fetch data from that file by > pre-defined I/O size and policy. If I am wrong? > If I am not wrong, then why there's so much difference. Joe, by > your experience, if my program has any big defect? You are ignoring the starting position in your calls to set the file position: pos = i * interval; <---- you need to change this to pos = sPos + (i * interval); fsetpos(fp,&pos); You'd also be better off hoisting your buffer allocation out of the for loop. Cheers, -Fab > 2) My major purpose is to have a program in our production env. ,which > will frequently fetch data from SSD, and there are also some additional > operations/work after data fetched - this is also why you see I put some > additional work after each I/O (such as memory allocation and de-allocation > in I/O calculation). > What I expect to see, its benchmark SHOULD be less than I/OMeter > benchmark. > > Would you advise more? Is there any big defect in my program for either > doing file I/O or I/O calculation? > > Thanks in advance!! > > Nai Yan. > > > > 2012/2/13 <jo...@ei...> > >> Manufacturer's quoted sequential MB/s won't be with 512byte reads. In >> Iometer, try 256KB sequential reads with about 8 outstanding I/Os. That >> should come closer to the maximum throughput(I doubt you'll be able to get >> your laptop to actually get close to 520MB/s though). 
>> >> I'll see if I can find a windows system to try to compile/run your >> program, but I can't make any promises. >> >> >> Joe >> >> >> Quoting Nai yan zhao <zha...@gm...>: >> >>> Hello Joe, >>> Thank you again for your time! >>> It's wired that from IOMeter, the throughput for sequential IOPS >>> (512B, queue depth is 64) is ONLY 42MB/s with around 82K IOPS. However, >>> from that SSD official website, this SSD sequential throughput should be >>> around 510MB/s ( >>> http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1, my SSD >>> is 128G). If there's any parameter I didn't set correctly in IOMeter? >>> >>> As you suggested, I try to create a 12GB sample file (my test bed >>> memory is 6GB and without RAID) and use 1 thread to do IO. The result >>> is 33666; However, with I/O meter, it's 11572 (throughput this time is >>> ONLY >>> 5.93MB/s); IOPS still 3 times!! >>> >>> I attach my IOMeter settings, if there's anything wrong? Also, I >>> attach my modified code. Joe, could you help again to see where's the >>> problem? >>> >>> Thank you so much!! >>> >>> Nai Yan. >>> >>> 2012/2/13 <jo...@ei...> >>> >>>> 82K sounds reasonable for iops on an SSD. You should check the specs of >>>> your drive to see what you should expect. >>>> >>>> You need to remember that you are doing file i/o so you have several >>>> layers of cache involved. think of it was file cache -> block cache -> >>>> controller cache -> drive cache (you aren't testing a HW RAID, so you >>>> probably don't have cache in you controller) My personal run of thumb >>>> for >>>> random I/O is to have my file size be about 3x my combined cache size. >>>> For >>>> example, 4G ram in system, 512MB RAID cache, (8 drives*32MB) = 4.75GB >>>> I'd >>>> do a 16GB file. >>>> >>>> If in iometer you are accessing a PHYSICALDISK, then you are avoiding >>>> window's file cache. 
>>>> >>>> I just pulled up the code and (keep in mind I'm not much of a windows >>>> guy) >>>> something looks odd in your GetSecs routine. The cast to double is going >>>> to >>>> lose resolution, I think I would store the start/end times as >>>> LARGE_INTEGER. And you probably only have to call the frequency routine >>>> once >>>> >>>> Also windows used to have issues in the HAL where if a thread got moved >>>> to >>>> a different processor you'd get odd results. There is a Windows API call >>>> for setting affinity, similar to the linux sched_set_affinity. >>>> >>>> This doesn't really matter for what we are talking about, it is just a >>>> pet >>>> peeve of mine, your "delete c;" should be "delete [] c;" (are you >>>> intending >>>> tp be timing your allocator calls as well? you may be if you are >>>> simulating >>>> system performance, but typically for disk performance you'd try to >>>> preallocate as much as possible so your only timing the transfers) >>>> >>>> >>>> If it were me I would start with something simplier, (say single >>>> threaded >>>> sequential read) and see if your program gets the correct values then. >>>> You >>>> could also fire up windows performance monitor and try to correlate to >>>> its >>>> counts as well (PHYSICALDISK transfers/sec). >>>> >>>> Good Luck, >>>> >>>> Joe >>>> >>>> >>>> >>>> Quoting Nai yan zhao <zha...@gm...>: >>>> >>>> Hello Fabian and Joe, >>>>> >>>>> Thank you so much for your reply. >>>>> >>>>> Actually, what I am trying to do, is to split a file into 32 parts, >>>>> and each part will be assigned to a thread to read. Each thread each >>>>> time >>>>> to open file, read 512B, and close file. I was trying to avoid 2 read >>>>> I/Os >>>>> hit 1 block(512B) - i.e. to avoid cache in SSD (it's 128MB), although >>>>> most >>>>> read I/Os are ordered but not >>>>> >>>>> contiguous<http://en.**wikipedia.org/wiki/Contiguity#**Computer_science<http://en.wikipedia.org/wiki/Contiguity#Computer_science> >>>>> >>>>> > >>>>> . 
>>>>> >>>>> >>>>> By your suggestion, I tried 512B sequential I/O with settings below, >>>>> >>>>> Max disk size - 8388608 >>>>> # of Outstanding I/O - 32 (for 64, it's also around 82K) >>>>> Transfer request size - 512B, >>>>> 100% sequential >>>>> Reply size - no reply >>>>> Align I/Os on - Sector boundaries >>>>> >>>>> The result is around 82K, still much slower than my program. >>>>> >>>>> If my program has any defect in calculating IOPS? Or if I have any >>>>> misunderstanding of caching of SSD or file system, which causes my >>>>> program >>>>> fetches data most from RAM of SSD? Or what parameters I should set in >>>>> I/O >>>>> meter to simulate my program I/O? >>>>> >>>>> Thank you again in advance for your time to help investigate it!! >>>>> >>>>> Nai Yan. >>>>> >>>>> 2012/2/11 Fabian Tillier <fa...@ti...> >>>>> >>>>> If I read the test correctly, all threads start at offset 0, and then >>>>>> >>>>>> perform 512b reads with a 1024b stride between reads. As Joe said, >>>>>> this is pretty much sequential reading, and all threads are reading >>>>>> the same data, so most are likely to be satisifed from cache, either >>>>>> in the OS or on the SSD itself. They'll do 320000/16=20000 IO >>>>>> operations total each, so end up reading 20MB of the file. It's quite >>>>>> likely that the whole 20MB that you are reading will sit happilly in >>>>>> the file cache. >>>>>> >>>>>> Create an access pattern that mimics your app (512b sequential with >>>>>> 1024b stride), create 32 workers, and see if the results are similar. >>>>>> Best would be if you created a test file of 20MB, too. You can then >>>>>> see how things compare if you go with async I/O and a single thread. >>>>>> >>>>>> Cheers, >>>>>> -Fab >>>>>> >>>>>> On Fri, Feb 10, 2012 at 5:40 AM, <jo...@ei...> wrote: >>>>>> > Forgive me if I missed it, but I don't see any randomization in your >>>>>> > file reads. 
>>>>>> > >>>>>> > It looks like you just skip ahead so thread 0 reads the first >>>>>> > 512bytes, thread 1 the next 512b. So any storage will be >>>>>> > prefetching >>>>>> > very effectively. >>>>>> > >>>>>> > Tell Iometer to do sequential instead of random and see how much >>>>>> > closer the numbers are. Or better yet, make your program randomize >>>>>> > its reads over the entire disk. >>>>>> > >>>>>> > Joe >>>>>> > >>>>>> > >>>>>> > Quoting Nai yan zhao <zha...@gm...>: >>>>>> > >>>>>> >> Greetings, >>>>>> >> Could anybody help me a little out of my difficulty? >>>>>> >> >>>>>> >> I have a SSD and I am trying to use it to simulate my program >>>>>> >> I/O >>>>>> >> performance, however, IOPS calculated from my program is much much >>>>>> faster >>>>>> >> than IOMeter. >>>>>> >> >>>>>> >> My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B random >>>>>> >> read >>>>>> >> IOPS is around 94k (queue depth is 32). >>>>>> >> However my program (32 windows threads) can reach around 500k >>>>>> 512B >>>>>> >> IOPS, around 5 times of IOMeter!!! I did data validation but didn't >>>>>> find >>>>>> >> any error in data fetching. It's because my data fetching in order? >>>>>> >> >>>>>> >> I paste my code belwo (it mainly fetch 512B from file and >>>>>> >> release >>>>>> it; >>>>>> >> I did use 4bytes (an int) to validate program logic and didn't find >>>>>> >> problem), can anybody help me figure out where I am wrong? >>>>>> >> >>>>>> >> Thanks so much in advance!! >>>>>> >> >>>>>> >> Nai Yan. >>>>>> >> >>>>>> >> #include <stdio.h> >>>>>> >> #include <Windows.h> >>>>>> >> /* >>>>>> >> ** Purpose: Verify file random read IOPS in comparison with >>>>>> >> IOMeter >>>>>> >> ** Author: Nai Yan >>>>>> >> ** Date: Feb. 
9th, 2012 >>>>>> >> **/ >>>>>> >> //Global variables >>>>>> >> long completeIOs = 0; >>>>>> >> long completeBytes = 0; >>>>>> >> int threadCount = 32; >>>>>> >> unsigned long long length = 1073741824; //4G test >>>>>> file >>>>>> >> int interval = 1024; >>>>>> >> int resultArrayLen = 320000; >>>>>> >> int *result = new int[resultArrayLen]; >>>>>> >> //Method declarison >>>>>> >> double GetSecs(void); //Calculate out duration >>>>>> >> int InitPool(long long,char*,int); //Initialize test data >>>>>> >> for >>>>>> >> testing, if successful, return 1; otherwise, return a non 1 value. >>>>>> >> int * FileRead(char * path); >>>>>> >> unsigned int DataVerification(int*, int sampleItem); >>>>>> >> //Verify data fetched from pool >>>>>> >> int main() >>>>>> >> { >>>>>> >> int sampleItem = 0x1; >>>>>> >> char * fPath = "G:\\workspace\\4G.bin"; >>>>>> >> unsigned int invalidIO = 0; >>>>>> >> if (InitPool(length,fPath,**sampleItem)!= 1) >>>>>> >>>>>> >> printf("File write err... \n"); >>>>>> >> //start do random I/Os from initialized file >>>>>> >> double start = GetSecs(); >>>>>> >> int * fetchResult = FileRead(fPath); >>>>>> >> double end = GetSecs(); >>>>>> >> printf("File read IOPS is %.4f per second.. \n",completeIOs/(end - >>>>>> start)); >>>>>> >> //start data validation, for 4 bytes fetch only >>>>>> >> // invalidIO = DataVerification(fetchResult,**sampleItem); >>>>>> >>>>>> >> // if (invalidIO !=0) >>>>>> >> // { >>>>>> >> // printf("Total invalid data fetch IOs are %d", invalidIO); >>>>>> >> // } >>>>>> >> return 0; >>>>>> >> } >>>>>> >> >>>>>> >> >>>>>> >> int InitPool(long long length, char* path, int sample) >>>>>> >> { >>>>>> >> printf("Start initializing test data ... \n"); >>>>>> >> FILE * fp = fopen(path,"wb"); >>>>>> >> if (fp == NULL) >>>>>> >> { >>>>>> >> printf("file open err... 
\n"); >>>>>> >> exit (-1); >>>>>> >> } >>>>>> >> else //initialize file for testing >>>>>> >> { >>>>>> >> fseek(fp,0L,SEEK_SET); >>>>>> >> for (int i=0; i<length; i++) >>>>>> >> { >>>>>> >> fwrite(&sample,sizeof(int),1,**fp); >>>>>> >>>>>> >> } >>>>>> >> fclose(fp); >>>>>> >> fp = NULL; >>>>>> >> printf("Data initialization is complete...\n"); >>>>>> >> return 1; >>>>>> >> } >>>>>> >> } >>>>>> >> double GetSecs(void) >>>>>> >> { >>>>>> >> LARGE_INTEGER frequency; >>>>>> >> LARGE_INTEGER start; >>>>>> >> if(! QueryPerformanceFrequency(&**frequency)) >>>>>> >> printf("**QueryPerformanceFrequency Failed\n"); >>>>>> >> if(! QueryPerformanceCounter(&**start)) >>>>>> >> printf("**QueryPerformanceCounter Failed\n"); >>>>>> >> return ((double)start.QuadPart/(**double)frequency.QuadPart); >>>>>> >>>>>> >> } >>>>>> >> class input >>>>>> >> { >>>>>> >> public: >>>>>> >> char *path; >>>>>> >> int starting; >>>>>> >> input (int st, char * filePath):starting(st),path(**filePath){} >>>>>> >>>>>> >> }; >>>>>> >> //Workers >>>>>> >> DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) >>>>>> >> { >>>>>> >> input * in = (input*) lpThreadParameter; >>>>>> >> char* path = in->path; >>>>>> >> FILE * fp = fopen(path,"rb"); >>>>>> >> int sPos = in->starting; >>>>>> >> // int * result = in->r; >>>>>> >> if(fp != NULL) >>>>>> >> { >>>>>> >> fpos_t pos; >>>>>> >> for (int i=0; i<resultArrayLen/threadCount;**i++) >>>>>> >>>>>> >> { >>>>>> >> pos = i * interval; >>>>>> >> fsetpos(fp,&pos); >>>>>> >> //For 512 bytes fetch each time >>>>>> >> unsigned char *c =new unsigned char [512]; >>>>>> >> if (fread(c,512,1,fp) ==1) >>>>>> >> { >>>>>> >> InterlockedIncrement(&**completeIOs); >>>>>> >>>>>> >> delete c; >>>>>> >> } >>>>>> >> //For 4 bytes fetch each time >>>>>> >> /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) >>>>>> >> { >>>>>> >> InterlockedIncrement(&**completeIOs); >>>>>> >>>>>> >> }*/ >>>>>> >> else >>>>>> >> { >>>>>> >> printf("file read err...\n"); >>>>>> >> exit(-1); 
>>>>>> >> } >>>>>> >> } >>>>>> >> fclose(fp); >>>>>> >> fp = NULL; >>>>>> >> } >>>>>> >> else >>>>>> >> { >>>>>> >> printf("File open err... \n"); >>>>>> >> exit(-1); >>>>>> >> } >>>>>> >> } >>>>>> >> int * FileRead(char * p) >>>>>> >> { >>>>>> >> printf("Starting reading file ... \n"); >>>>>> >> HANDLE mWorkThread[256]; //max 256 threads >>>>>> >> completeIOs = 0; >>>>>> >> int slice = int (resultArrayLen/threadCount); >>>>>> >> for(int i = 0; i < threadCount; i++) >>>>>> >> { >>>>>> >> mWorkThread[i] = CreateThread( >>>>>> >> NULL, >>>>>> >> 0, >>>>>> >> FileReadThreadEntry, >>>>>> >> (LPVOID)(new input(i*slice,p)), >>>>>> >> 0, >>>>>> >> NULL); >>>>>> >> } >>>>>> >> WaitForMultipleObjects(**threadCount, mWorkThread, TRUE, >>>>>> >> INFINITE); >>>>>> >>>>>> >> printf("File read complete... \n"); >>>>>> >> return result; >>>>>> >> } >>>>>> >> unsigned int DataVerification(int* result, int sampleItem) >>>>>> >> { >>>>>> >> unsigned int invalid = 0; >>>>>> >> for (int i=0; i< resultArrayLen/interval;i++) >>>>>> >> { >>>>>> >> if (result[i]!=sampleItem) >>>>>> >> { >>>>>> >> invalid ++; >>>>>> >> continue; >>>>>> >> } >>>>>> >> } >>>>>> >> return invalid; >>>>>> >> } >>>>>> >> >>>>>> > >>>>>> > >>>>>> > >>>>>> > >>>>>> > >>>>>> ------------------------------**------------------------------** >>>>>> >>>>>> ------------------ >>>>>> > Virtualization & Cloud Management Using Capacity Planning >>>>>> > Cloud computing makes use of virtualization - but cloud computing >>>>>> > also focuses on allowing computing to be delivered as a service. 
>>>>>> > >>>>>> > http://www.accelacomm.com/jaw/**sfnl/114/51521223/<http://www.accelacomm.com/jaw/sfnl/114/51521223/> >>>>>> > ______________________________**_________________ >>>>>> > Iometer-devel mailing list >>>>>> > >>>>>> > Iometer-devel@lists.**sourceforge.net<Iom...@li...> >>>>>> > >>>>>> > https://lists.sourceforge.net/**lists/listinfo/iometer-devel<https://lists.sourceforge.net/lists/listinfo/iometer-devel> >>>>>> >>>>>> >>>>> >>>> >>>> >>>> >>> >> >> >> > > > ------------------------------------------------------------------------------ > Try before you buy = See our experts in action! > The most comprehensive online learning library for Microsoft developers > is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3, > Metro Style Apps, more. Free future releases when you subscribe now! > http://p.sf.net/sfu/learndevnow-dev2 > _______________________________________________ > Iometer-devel mailing list > Iom...@li... > https://lists.sourceforge.net/lists/listinfo/iometer-devel > |
From: Vedran D. <ve...@ya...> - 2012-02-13 06:20:57
|
Nai Yan, Your program does not specify anything about the file caching attributes, so the data is most likely coming out of the filesystem cache. And if you have a bunch of threads scanning through the same set of LBAs, the SSD cache might be helping as well. Try running 1 thread with 1 iteration right after boot and see what numbers you get. Regards, Ved >________________________________ >From: Nai yan zhao <zha...@gm...> >To: jo...@ei... >Cc: Iom...@li... >Sent: Sunday, February 12, 2012 6:59 PM >Subject: Re: [Iometer-devel] Please advise - Why IOPS by IOMeter is much slower than windows multi-threading data fetch IOPS? > > >Hello Joe, > Again, thank you for your reply! I will take your suggestion and try again. But I am very looking forward to your further investigation on Windows system for my program. > > > I trust IOMeter, but I can't explain why and where's the problem with my program. And further speaking, would you give me some comments? > 1) What's the difference between IOmeter I/O calculation and my program (although it's much much simpler)? From the behavior of IOMeter, it also seems to create a file on target disk and MAYBE fetch data from that file by pre-defined I/O size and policy. If I am wrong? > If I am not wrong, then why there's so much difference. Joe, by your experience, if my program has any big defect? > > > 2) My major purpose is to have a program in our production env. ,which will frequently fetch data from SSD, and there are also some additional operations/work after data fetched - this is also why you see I put some additional work after each I/O (such as memory allocation and de-allocation in I/O calculation). > What I expect to see, its benchmark SHOULD be less than I/OMeter benchmark. > > > Would you advise more? Is there any big defect in my program for either doing file I/O or I/O calculation? > > > Thanks in advance!! > > >Nai Yan. 
> > > > > > >2012/2/13 <jo...@ei...> > >Manufacturer's quoted sequential MB/s won't be with 512byte reads. In Iometer, try 256KB sequential reads with about 8 outstanding I/Os. That should come closer to the maximum throughput(I doubt you'll be able to get your laptop to actually get close to 520MB/s though). >> >>I'll see if I can find a windows system to try to compile/run your program, but I can't make any promises. >> >> >>Joe >> >> >>Quoting Nai yan zhao <zha...@gm...>: >> >> >>Hello Joe, >>> Thank you again for your time! >>> It's wired that from IOMeter, the throughput for sequential IOPS >>>(512B, queue depth is 64) is ONLY 42MB/s with around 82K IOPS. However, >>>from that SSD official website, this SSD sequential throughput should be >>>around 510MB/s ( >>>http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1, my SSD >>>is 128G). If there's any parameter I didn't set correctly in IOMeter? >>> >>> As you suggested, I try to create a 12GB sample file (my test bed >>>memory is 6GB and without RAID) and use 1 thread to do IO. The result >>>is 33666; However, with I/O meter, it's 11572 (throughput this time is ONLY >>>5.93MB/s); IOPS still 3 times!! >>> >>> I attach my IOMeter settings, if there's anything wrong? Also, I >>>attach my modified code. Joe, could you help again to see where's the >>>problem? >>> >>> Thank you so much!! >>> >>>Nai Yan. >>> >>>2012/2/13 <jo...@ei...> >>> >>> >>>82K sounds reasonable for iops on an SSD. You should check the specs of >>>>your drive to see what you should expect. >>>> >>>>You need to remember that you are doing file i/o so you have several >>>>layers of cache involved. think of it was file cache -> block cache -> >>>>controller cache -> drive cache (you aren't testing a HW RAID, so you >>>>probably don't have cache in you controller) My personal run of thumb for >>>>random I/O is to have my file size be about 3x my combined cache size. 
For >>>>example, 4G ram in system, 512MB RAID cache, (8 drives*32MB) = 4.75GB I'd >>>>do a 16GB file. >>>> >>>>If in iometer you are accessing a PHYSICALDISK, then you are avoiding >>>>window's file cache. >>>> >>>>I just pulled up the code and (keep in mind I'm not much of a windows guy) >>>>something looks odd in your GetSecs routine. The cast to double is going to >>>>lose resolution, I think I would store the start/end times as >>>>LARGE_INTEGER. And you probably only have to call the frequency routine once >>>> >>>>Also windows used to have issues in the HAL where if a thread got moved to >>>>a different processor you'd get odd results. There is a Windows API call >>>>for setting affinity, similar to the linux sched_set_affinity. >>>> >>>>This doesn't really matter for what we are talking about, it is just a pet >>>>peeve of mine, your "delete c;" should be "delete [] c;" (are you intending >>>>tp be timing your allocator calls as well? you may be if you are simulating >>>>system performance, but typically for disk performance you'd try to >>>>preallocate as much as possible so your only timing the transfers) >>>> >>>> >>>>If it were me I would start with something simplier, (say single threaded >>>>sequential read) and see if your program gets the correct values then. You >>>>could also fire up windows performance monitor and try to correlate to its >>>>counts as well (PHYSICALDISK transfers/sec). >>>> >>>>Good Luck, >>>> >>>>Joe >>>> >>>> >>>> >>>>Quoting Nai yan zhao <zha...@gm...>: >>>> >>>> Hello Fabian and Joe, >>>> >>>> Thank you so much for your reply. >>>>> >>>>> Actually, what I am trying to do, is to split a file into 32 parts, >>>>>and each part will be assigned to a thread to read. Each thread each time >>>>>to open file, read 512B, and close file. I was trying to avoid 2 read >>>>>I/Os >>>>>hit 1 block(512B) - i.e. 
to avoid cache in SSD (it's 128MB), although most >>>>>read I/Os are ordered but not >>>>>contiguous<http://en.**wikipedia.org/wiki/Contiguity#**Computer_science<http://en.wikipedia.org/wiki/Contiguity#Computer_science> >>>>> >>>>>> >>>>>. >>>>> >>>>> >>>>> By your suggestion, I tried 512B sequential I/O with settings below, >>>>> >>>>> Max disk size - 8388608 >>>>> # of Outstanding I/O - 32 (for 64, it's also around 82K) >>>>> Transfer request size - 512B, >>>>> 100% sequential >>>>> Reply size - no reply >>>>> Align I/Os on - Sector boundaries >>>>> >>>>> The result is around 82K, still much slower than my program. >>>>> >>>>> If my program has any defect in calculating IOPS? Or if I have any >>>>>misunderstanding of caching of SSD or file system, which causes my program >>>>>fetches data most from RAM of SSD? Or what parameters I should set in I/O >>>>>meter to simulate my program I/O? >>>>> >>>>> Thank you again in advance for your time to help investigate it!! >>>>> >>>>>Nai Yan. >>>>> >>>>>2012/2/11 Fabian Tillier <fa...@ti...> >>>>> >>>>> If I read the test correctly, all threads start at offset 0, and then >>>>> >>>>>perform 512b reads with a 1024b stride between reads. As Joe said, >>>>>>this is pretty much sequential reading, and all threads are reading >>>>>>the same data, so most are likely to be satisifed from cache, either >>>>>>in the OS or on the SSD itself. They'll do 320000/16=20000 IO >>>>>>operations total each, so end up reading 20MB of the file. It's quite >>>>>>likely that the whole 20MB that you are reading will sit happilly in >>>>>>the file cache. >>>>>> >>>>>>Create an access pattern that mimics your app (512b sequential with >>>>>>1024b stride), create 32 workers, and see if the results are similar. >>>>>>Best would be if you created a test file of 20MB, too. You can then >>>>>>see how things compare if you go with async I/O and a single thread. 
>>>>>> >>>>>>Cheers, >>>>>>-Fab >>>>>> >>>>>>On Fri, Feb 10, 2012 at 5:40 AM, <jo...@ei...> wrote: >>>>>>> Forgive me if I missed it, but I don't see any randomization in your >>>>>>> file reads. >>>>>>> >>>>>>> It looks like you just skip ahead so thread 0 reads the first >>>>>>> 512bytes, thread 1 the next 512b. So any storage will be prefetching >>>>>>> very effectively. >>>>>>> >>>>>>> Tell Iometer to do sequential instead of random and see how much >>>>>>> closer the numbers are. Or better yet, make your program randomize >>>>>>> its reads over the entire disk. >>>>>>> >>>>>>> Joe >>>>>>> >>>>>>> >>>>>>> Quoting Nai yan zhao <zha...@gm...>: >>>>>>> >>>>>>>> Greetings, >>>>>>>> Could anybody help me a little out of my difficulty? >>>>>>>> >>>>>>>> I have a SSD and I am trying to use it to simulate my program I/O >>>>>>>> performance, however, IOPS calculated from my program is much much >>>>>>faster >>>>>>>> than IOMeter. >>>>>>>> >>>>>>>> My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B random read >>>>>>>> IOPS is around 94k (queue depth is 32). >>>>>>>> However my program (32 windows threads) can reach around 500k >>>>>>512B >>>>>>>> IOPS, around 5 times of IOMeter!!! I did data validation but didn't >>>>>>find >>>>>>>> any error in data fetching. It's because my data fetching in order? >>>>>>>> >>>>>>>> I paste my code belwo (it mainly fetch 512B from file and release >>>>>>it; >>>>>>>> I did use 4bytes (an int) to validate program logic and didn't find >>>>>>>> problem), can anybody help me figure out where I am wrong? >>>>>>>> >>>>>>>> Thanks so much in advance!! >>>>>>>> >>>>>>>> Nai Yan. >>>>>>>> >>>>>>>> #include <stdio.h> >>>>>>>> #include <Windows.h> >>>>>>>> /* >>>>>>>> ** Purpose: Verify file random read IOPS in comparison with IOMeter >>>>>>>> ** Author: Nai Yan >>>>>>>> ** Date: Feb. 
9th, 2012 >>>>>>>> **/ >>>>>>>> //Global variables >>>>>>>> long completeIOs = 0; >>>>>>>> long completeBytes = 0; >>>>>>>> int threadCount = 32; >>>>>>>> unsigned long long length = 1073741824; //4G test >>>>>>file >>>>>>>> int interval = 1024; >>>>>>>> int resultArrayLen = 320000; >>>>>>>> int *result = new int[resultArrayLen]; >>>>>>>> //Method declarison >>>>>>>> double GetSecs(void); //Calculate out duration >>>>>>>> int InitPool(long long,char*,int); //Initialize test data for >>>>>>>> testing, if successful, return 1; otherwise, return a non 1 value. >>>>>>>> int * FileRead(char * path); >>>>>>>> unsigned int DataVerification(int*, int sampleItem); >>>>>>>> //Verify data fetched from pool >>>>>>>> int main() >>>>>>>> { >>>>>>>> int sampleItem = 0x1; >>>>>>>> char * fPath = "G:\\workspace\\4G.bin"; >>>>>>>> unsigned int invalidIO = 0; >>>>>>>> if (InitPool(length,fPath,**sampleItem)!= 1) >>>>>> >>>>>>>> printf("File write err... \n"); >>>>>>>> //start do random I/Os from initialized file >>>>>>>> double start = GetSecs(); >>>>>>>> int * fetchResult = FileRead(fPath); >>>>>>>> double end = GetSecs(); >>>>>>>> printf("File read IOPS is %.4f per second.. \n",completeIOs/(end - >>>>>>start)); >>>>>>>> //start data validation, for 4 bytes fetch only >>>>>>>> // invalidIO = DataVerification(fetchResult,**sampleItem); >>>>>> >>>>>>>> // if (invalidIO !=0) >>>>>>>> // { >>>>>>>> // printf("Total invalid data fetch IOs are %d", invalidIO); >>>>>>>> // } >>>>>>>> return 0; >>>>>>>> } >>>>>>>> >>>>>>>> >>>>>>>> int InitPool(long long length, char* path, int sample) >>>>>>>> { >>>>>>>> printf("Start initializing test data ... \n"); >>>>>>>> FILE * fp = fopen(path,"wb"); >>>>>>>> if (fp == NULL) >>>>>>>> { >>>>>>>> printf("file open err... 
\n"); >>>>>>>> exit (-1); >>>>>>>> } >>>>>>>> else //initialize file for testing >>>>>>>> { >>>>>>>> fseek(fp,0L,SEEK_SET); >>>>>>>> for (int i=0; i<length; i++) >>>>>>>> { >>>>>>>> fwrite(&sample,sizeof(int),1,**fp); >>>>>> >>>>>>>> } >>>>>>>> fclose(fp); >>>>>>>> fp = NULL; >>>>>>>> printf("Data initialization is complete...\n"); >>>>>>>> return 1; >>>>>>>> } >>>>>>>> } >>>>>>>> double GetSecs(void) >>>>>>>> { >>>>>>>> LARGE_INTEGER frequency; >>>>>>>> LARGE_INTEGER start; >>>>>>>> if(! QueryPerformanceFrequency(&**frequency)) >>>>>>>> printf("**QueryPerformanceFrequency Failed\n"); >>>>>>>> if(! QueryPerformanceCounter(&**start)) >>>>>>>> printf("**QueryPerformanceCounter Failed\n"); >>>>>>>> return ((double)start.QuadPart/(**double)frequency.QuadPart); >>>>>> >>>>>>>> } >>>>>>>> class input >>>>>>>> { >>>>>>>> public: >>>>>>>> char *path; >>>>>>>> int starting; >>>>>>>> input (int st, char * filePath):starting(st),path(**filePath){} >>>>>> >>>>>>>> }; >>>>>>>> //Workers >>>>>>>> DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) >>>>>>>> { >>>>>>>> input * in = (input*) lpThreadParameter; >>>>>>>> char* path = in->path; >>>>>>>> FILE * fp = fopen(path,"rb"); >>>>>>>> int sPos = in->starting; >>>>>>>> // int * result = in->r; >>>>>>>> if(fp != NULL) >>>>>>>> { >>>>>>>> fpos_t pos; >>>>>>>> for (int i=0; i<resultArrayLen/threadCount;**i++) >>>>>> >>>>>>>> { >>>>>>>> pos = i * interval; >>>>>>>> fsetpos(fp,&pos); >>>>>>>> //For 512 bytes fetch each time >>>>>>>> unsigned char *c =new unsigned char [512]; >>>>>>>> if (fread(c,512,1,fp) ==1) >>>>>>>> { >>>>>>>> InterlockedIncrement(&**completeIOs); >>>>>> >>>>>>>> delete c; >>>>>>>> } >>>>>>>> //For 4 bytes fetch each time >>>>>>>> /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) >>>>>>>> { >>>>>>>> InterlockedIncrement(&**completeIOs); >>>>>> >>>>>>>> }*/ >>>>>>>> else >>>>>>>> { >>>>>>>> printf("file read err...\n"); >>>>>>>> exit(-1); >>>>>>>> } >>>>>>>> } >>>>>>>> fclose(fp); >>>>>>>> fp = NULL; 
>>>>>>>> } >>>>>>>> else >>>>>>>> { >>>>>>>> printf("File open err... \n"); >>>>>>>> exit(-1); >>>>>>>> } >>>>>>>> } >>>>>>>> int * FileRead(char * p) >>>>>>>> { >>>>>>>> printf("Starting reading file ... \n"); >>>>>>>> HANDLE mWorkThread[256]; //max 256 threads >>>>>>>> completeIOs = 0; >>>>>>>> int slice = int (resultArrayLen/threadCount); >>>>>>>> for(int i = 0; i < threadCount; i++) >>>>>>>> { >>>>>>>> mWorkThread[i] = CreateThread( >>>>>>>> NULL, >>>>>>>> 0, >>>>>>>> FileReadThreadEntry, >>>>>>>> (LPVOID)(new input(i*slice,p)), >>>>>>>> 0, >>>>>>>> NULL); >>>>>>>> } >>>>>>>> WaitForMultipleObjects(**threadCount, mWorkThread, TRUE, INFINITE); >>>>>> >>>>>>>> printf("File read complete... \n"); >>>>>>>> return result; >>>>>>>> } >>>>>>>> unsigned int DataVerification(int* result, int sampleItem) >>>>>>>> { >>>>>>>> unsigned int invalid = 0; >>>>>>>> for (int i=0; i< resultArrayLen/interval;i++) >>>>>>>> { >>>>>>>> if (result[i]!=sampleItem) >>>>>>>> { >>>>>>>> invalid ++; >>>>>>>> continue; >>>>>>>> } >>>>>>>> } >>>>>>>> return invalid; >>>>>>>> } >>>>>>>> >>>>>>> >>>>>>> >>>>>>> >>>>>>> >>>>>>> >>>>>>------------------------------**------------------------------** >>>>>> >>>>>>------------------ >>>>>>> Virtualization & Cloud Management Using Capacity Planning >>>>>>> Cloud computing makes use of virtualization - but cloud computing >>>>>>> also focuses on allowing computing to be delivered as a service. 
>>>>>>> http://www.accelacomm.com/jaw/**sfnl/114/51521223/<http://www.accelacomm.com/jaw/sfnl/114/51521223/> >>>>>>> ______________________________**_________________ >>>>>>> Iometer-devel mailing list >>>>>>> Iometer-devel@lists.**sourceforge.net<Iom...@li...> >>>>>>> https://lists.sourceforge.net/**lists/listinfo/iometer-devel<https://lists.sourceforge.net/lists/listinfo/iometer-devel> >>>>>> >>>>>> >>>>>> >>>>> >>>> >>>> >>>> >>> >> >> >> > >------------------------------------------------------------------------------ >Try before you buy = See our experts in action! >The most comprehensive online learning library for Microsoft developers >is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3, >Metro Style Apps, more. Free future releases when you subscribe now! >http://p.sf.net/sfu/learndevnow-dev2 >_______________________________________________ >Iometer-devel mailing list >Iom...@li... >https://lists.sourceforge.net/lists/listinfo/iometer-devel > > > |
From: Nai y. z. <zha...@gm...> - 2012-02-14 01:35:13
|
Hello Vedran, Thank you for your time to reply. However it seems filesystem cache is not the cause for this problem. I tried 2 times run after boot, and each is around 28K (although a little less than yesterday). Regarding SSD, I was trying to ask each read I/O to access different LBA to avoid to hit SSD cache. Any further suggestion? Thanks! Nai Yan. 2012/2/13 Vedran Degoricija <ve...@ya...> > Nai Yan, > > Your program does not specify anything about the file caching attributes, > so the data is most likely coming out of the filesystem cache. And if you > have a bunch of threads scanning through the same set of LBAs, the SSD > cache might be helping as well. > > Try running 1 thread with 1 iteration right after boot and see what > numbers you get. > > Regards, > Ved > > *From:* Nai yan zhao <zha...@gm...> > *To:* jo...@ei... > *Cc:* Iom...@li... > *Sent:* Sunday, February 12, 2012 6:59 PM > > *Subject:* Re: [Iometer-devel] Please advise - Why IOPS by IOMeter is > much slower than windows multi-threading data fetch IOPS? > > Hello Joe, > Again, thank you for your reply! I will take your suggestion and try > again. But I am very looking forward to your further investigation on > Windows system for my program. > > I trust IOMeter, but I can't explain why and where's the problem with > my program. And further speaking, would you give me some comments? > 1) What's the difference between IOmeter I/O calculation and my > program (although it's much much simpler)? From the behavior of IOMeter, it > also seems to create a file on target disk and MAYBE fetch data from that > file by pre-defined I/O size and policy. If I am wrong? > If I am not wrong, then why there's so much difference. Joe, by > your experience, if my program has any big defect? > > 2) My major purpose is to have a program in our production env. 
> ,which will frequently fetch data from SSD, and there are also some > additional operations/work after data fetched - this is also why you see I > put some additional work after each I/O (such as memory allocation and > de-allocation in I/O calculation). > What I expect to see, its benchmark SHOULD be less than I/OMeter > benchmark. > > Would you advise more? Is there any big defect in my program for > either doing file I/O or I/O calculation? > > Thanks in advance!! > > Nai Yan. > > > > 2012/2/13 <jo...@ei...> > > Manufacturer's quoted sequential MB/s won't be with 512byte reads. In > Iometer, try 256KB sequential reads with about 8 outstanding I/Os. That > should come closer to the maximum throughput(I doubt you'll be able to get > your laptop to actually get close to 520MB/s though). > > I'll see if I can find a windows system to try to compile/run your > program, but I can't make any promises. > > > Joe > > > Quoting Nai yan zhao <zha...@gm...>: > > Hello Joe, > Thank you again for your time! > It's wired that from IOMeter, the throughput for sequential IOPS > (512B, queue depth is 64) is ONLY 42MB/s with around 82K IOPS. However, > from that SSD official website, this SSD sequential throughput should be > around 510MB/s ( > http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1, my SSD > is 128G). If there's any parameter I didn't set correctly in IOMeter? > > As you suggested, I try to create a 12GB sample file (my test bed > memory is 6GB and without RAID) and use 1 thread to do IO. The result > is 33666; However, with I/O meter, it's 11572 (throughput this time is ONLY > 5.93MB/s); IOPS still 3 times!! > > I attach my IOMeter settings, if there's anything wrong? Also, I > attach my modified code. Joe, could you help again to see where's the > problem? > > Thank you so much!! > > Nai Yan. > > 2012/2/13 <jo...@ei...> > > 82K sounds reasonable for iops on an SSD. You should check the specs of > your drive to see what you should expect. 
> > You need to remember that you are doing file i/o so you have several > layers of cache involved. think of it was file cache -> block cache -> > controller cache -> drive cache (you aren't testing a HW RAID, so you > probably don't have cache in you controller) My personal run of thumb for > random I/O is to have my file size be about 3x my combined cache size. For > example, 4G ram in system, 512MB RAID cache, (8 drives*32MB) = 4.75GB I'd > do a 16GB file. > > If in iometer you are accessing a PHYSICALDISK, then you are avoiding > window's file cache. > > I just pulled up the code and (keep in mind I'm not much of a windows guy) > something looks odd in your GetSecs routine. The cast to double is going to > lose resolution, I think I would store the start/end times as > LARGE_INTEGER. And you probably only have to call the frequency routine > once > > Also windows used to have issues in the HAL where if a thread got moved to > a different processor you'd get odd results. There is a Windows API call > for setting affinity, similar to the linux sched_set_affinity. > > This doesn't really matter for what we are talking about, it is just a pet > peeve of mine, your "delete c;" should be "delete [] c;" (are you intending > tp be timing your allocator calls as well? you may be if you are simulating > system performance, but typically for disk performance you'd try to > preallocate as much as possible so your only timing the transfers) > > > If it were me I would start with something simplier, (say single threaded > sequential read) and see if your program gets the correct values then. You > could also fire up windows performance monitor and try to correlate to its > counts as well (PHYSICALDISK transfers/sec). > > Good Luck, > > Joe > > > > Quoting Nai yan zhao <zha...@gm...>: > > Hello Fabian and Joe, > > Thank you so much for your reply. > > Actually, what I am trying to do, is to split a file into 32 parts, > and each part will be assigned to a thread to read. 
Each thread each time > to open file, read 512B, and close file. I was trying to avoid 2 read > I/Os > hit 1 block(512B) - i.e. to avoid cache in SSD (it's 128MB), although most > read I/Os are ordered but not > contiguous<http://en.**wikiped**ia.org/wiki/Contiguity#**** > Computer_science <http://wikipedia.org/wiki/Contiguity#**Computer_science> > <http://en.**wikipedia.org/wiki/Contiguity#**Computer_science<http://en.wikipedia.org/wiki/Contiguity#Computer_science>> > > > > > . > > > By your suggestion, I tried 512B sequential I/O with settings below, > > Max disk size - 8388608 > # of Outstanding I/O - 32 (for 64, it's also around 82K) > Transfer request size - 512B, > 100% sequential > Reply size - no reply > Align I/Os on - Sector boundaries > > The result is around 82K, still much slower than my program. > > If my program has any defect in calculating IOPS? Or if I have any > misunderstanding of caching of SSD or file system, which causes my program > fetches data most from RAM of SSD? Or what parameters I should set in I/O > meter to simulate my program I/O? > > Thank you again in advance for your time to help investigate it!! > > Nai Yan. > > 2012/2/11 Fabian Tillier <fa...@ti...> > > If I read the test correctly, all threads start at offset 0, and then > > perform 512b reads with a 1024b stride between reads. As Joe said, > this is pretty much sequential reading, and all threads are reading > the same data, so most are likely to be satisifed from cache, either > in the OS or on the SSD itself. They'll do 320000/16=20000 IO > operations total each, so end up reading 20MB of the file. It's quite > likely that the whole 20MB that you are reading will sit happilly in > the file cache. > > Create an access pattern that mimics your app (512b sequential with > 1024b stride), create 32 workers, and see if the results are similar. > Best would be if you created a test file of 20MB, too. 
You can then > see how things compare if you go with async I/O and a single thread. > > Cheers, > -Fab > > On Fri, Feb 10, 2012 at 5:40 AM, <jo...@ei...> wrote: > > Forgive me if I missed it, but I don't see any randomization in your > > file reads. > > > > It looks like you just skip ahead so thread 0 reads the first > > 512bytes, thread 1 the next 512b. So any storage will be prefetching > > very effectively. > > > > Tell Iometer to do sequential instead of random and see how much > > closer the numbers are. Or better yet, make your program randomize > > its reads over the entire disk. > > > > Joe > > > > > > Quoting Nai yan zhao <zha...@gm...>: > > > >> Greetings, > >> Could anybody help me a little out of my difficulty? > >> > >> I have a SSD and I am trying to use it to simulate my program I/O > >> performance, however, IOPS calculated from my program is much much > faster > >> than IOMeter. > >> > >> My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B random read > >> IOPS is around 94k (queue depth is 32). > >> However my program (32 windows threads) can reach around 500k > 512B > >> IOPS, around 5 times of IOMeter!!! I did data validation but didn't > find > >> any error in data fetching. It's because my data fetching in order? > >> > >> I paste my code belwo (it mainly fetch 512B from file and release > it; > >> I did use 4bytes (an int) to validate program logic and didn't find > >> problem), can anybody help me figure out where I am wrong? > >> > >> Thanks so much in advance!! > >> > >> Nai Yan. > >> > >> #include <stdio.h> > >> #include <Windows.h> > >> /* > >> ** Purpose: Verify file random read IOPS in comparison with IOMeter > >> ** Author: Nai Yan > >> ** Date: Feb. 
9th, 2012 > >> **/ > >> //Global variables > >> long completeIOs = 0; > >> long completeBytes = 0; > >> int threadCount = 32; > >> unsigned long long length = 1073741824; //4G test > file > >> int interval = 1024; > >> int resultArrayLen = 320000; > >> int *result = new int[resultArrayLen]; > >> //Method declarison > >> double GetSecs(void); //Calculate out duration > >> int InitPool(long long,char*,int); //Initialize test data for > >> testing, if successful, return 1; otherwise, return a non 1 value. > >> int * FileRead(char * path); > >> unsigned int DataVerification(int*, int sampleItem); > >> //Verify data fetched from pool > >> int main() > >> { > >> int sampleItem = 0x1; > >> char * fPath = "G:\\workspace\\4G.bin"; > >> unsigned int invalidIO = 0; > >> if (InitPool(length,fPath,****sampleItem)!= 1) > > >> printf("File write err... \n"); > >> //start do random I/Os from initialized file > >> double start = GetSecs(); > >> int * fetchResult = FileRead(fPath); > >> double end = GetSecs(); > >> printf("File read IOPS is %.4f per second.. \n",completeIOs/(end - > start)); > >> //start data validation, for 4 bytes fetch only > >> // invalidIO = DataVerification(fetchResult,****sampleItem); > > >> // if (invalidIO !=0) > >> // { > >> // printf("Total invalid data fetch IOs are %d", invalidIO); > >> // } > >> return 0; > >> } > >> > >> > >> int InitPool(long long length, char* path, int sample) > >> { > >> printf("Start initializing test data ... \n"); > >> FILE * fp = fopen(path,"wb"); > >> if (fp == NULL) > >> { > >> printf("file open err... \n"); > >> exit (-1); > >> } > >> else //initialize file for testing > >> { > >> fseek(fp,0L,SEEK_SET); > >> for (int i=0; i<length; i++) > >> { > >> fwrite(&sample,sizeof(int),1,****fp); > > >> } > >> fclose(fp); > >> fp = NULL; > >> printf("Data initialization is complete...\n"); > >> return 1; > >> } > >> } > >> double GetSecs(void) > >> { > >> LARGE_INTEGER frequency; > >> LARGE_INTEGER start; > >> if(! 
QueryPerformanceFrequency(&****frequency)) > >> printf("****QueryPerformanceFrequency Failed\n"); > >> if(! QueryPerformanceCounter(&****start)) > >> printf("****QueryPerformanceCounter Failed\n"); > >> return ((double)start.QuadPart/(****double)frequency.QuadPart); > > >> } > >> class input > >> { > >> public: > >> char *path; > >> int starting; > >> input (int st, char * filePath):starting(st),path(****filePath){} > > >> }; > >> //Workers > >> DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) > >> { > >> input * in = (input*) lpThreadParameter; > >> char* path = in->path; > >> FILE * fp = fopen(path,"rb"); > >> int sPos = in->starting; > >> // int * result = in->r; > >> if(fp != NULL) > >> { > >> fpos_t pos; > >> for (int i=0; i<resultArrayLen/threadCount;****i++) > > >> { > >> pos = i * interval; > >> fsetpos(fp,&pos); > >> //For 512 bytes fetch each time > >> unsigned char *c =new unsigned char [512]; > >> if (fread(c,512,1,fp) ==1) > >> { > >> InterlockedIncrement(&****completeIOs); > > >> delete c; > >> } > >> //For 4 bytes fetch each time > >> /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) > >> { > >> InterlockedIncrement(&****completeIOs); > > >> }*/ > >> else > >> { > >> printf("file read err...\n"); > >> exit(-1); > >> } > >> } > >> fclose(fp); > >> fp = NULL; > >> } > >> else > >> { > >> printf("File open err... \n"); > >> exit(-1); > >> } > >> } > >> int * FileRead(char * p) > >> { > >> printf("Starting reading file ... \n"); > >> HANDLE mWorkThread[256]; //max 256 threads > >> completeIOs = 0; > >> int slice = int (resultArrayLen/threadCount); > >> for(int i = 0; i < threadCount; i++) > >> { > >> mWorkThread[i] = CreateThread( > >> NULL, > >> 0, > >> FileReadThreadEntry, > >> (LPVOID)(new input(i*slice,p)), > >> 0, > >> NULL); > >> } > >> WaitForMultipleObjects(****threadCount, mWorkThread, TRUE, > INFINITE); > > >> printf("File read complete... 
\n"); > >> return result; > >> } > >> unsigned int DataVerification(int* result, int sampleItem) > >> { > >> unsigned int invalid = 0; > >> for (int i=0; i< resultArrayLen/interval;i++) > >> { > >> if (result[i]!=sampleItem) > >> { > >> invalid ++; > >> continue; > >> } > >> } > >> return invalid; > >> } > >> > > > > > > > > > > > ------------------------------****----------------------------**--** > > ------------------ > > Virtualization & Cloud Management Using Capacity Planning > > Cloud computing makes use of virtualization - but cloud computing > > also focuses on allowing computing to be delivered as a service. > > http://www.accelacomm.com/jaw/****sfnl/114/51521223/<http://www.accelacomm.com/jaw/**sfnl/114/51521223/> > <http://**www.accelacomm.com/jaw/sfnl/**114/51521223/<http://www.accelacomm.com/jaw/sfnl/114/51521223/> > > > > ______________________________****_________________ > > Iometer-devel mailing list > > Iometer-devel@lists.**sourcefo**rge.net <http://sourceforge.net/>< > Iometer-devel@lists.**sourceforge.net<Iom...@li...> > > > > https://lists.sourceforge.net/****lists/listinfo/iometer-devel<https://lists.sourceforge.net/**lists/listinfo/iometer-devel> > **<https://lists.sourceforge.**net/lists/listinfo/iometer-**devel<https://lists.sourceforge.net/lists/listinfo/iometer-devel> > > > > > > > > > > > > > > > > ------------------------------------------------------------------------------ > Try before you buy = See our experts in action! > The most comprehensive online learning library for Microsoft developers > is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3, > Metro Style Apps, more. Free future releases when you subscribe now! > http://p.sf.net/sfu/learndevnow-dev2 > > _______________________________________________ > Iometer-devel mailing list > Iom...@li... > https://lists.sourceforge.net/lists/listinfo/iometer-devel > > > |
From: Vedran D. <ve...@ya...> - 2012-02-14 05:46:19
|
Hi Nai Yan, You need to double check your file access pattern/stride as the others have suggested. I'd also validate your file flags since after your initialization phase, a good portion of your file will be in the cache. What does Iometer report in the 28K IOPs case, and what are the workload parameters you are using? Good luck, Ved >________________________________ >From: Nai yan zhao <zha...@gm...> >To: ve...@ya... >Cc: "Iom...@li..." <iom...@li...> >Sent: Monday, February 13, 2012 5:35 PM >Subject: [Iometer-devel] Please advise - Why IOPS by IOMeter is much slower than windows multi-threading data fetch IOPS? > > >Hello Vedran, > Thank you for your time to reply. However it seems filesystem cache is not the cause for this problem. I tried 2 times run after boot, and each is around 28K (although a little less than yesterday). Regarding SSD, I was trying to ask each read I/O to access different LBA to avoid to hit SSD cache. > > > Any further suggestion? Thanks! > > >Nai Yan. > > > > > >2012/2/13 Vedran Degoricija <ve...@ya...> > >Nai Yan, >> >>Your program does not specify anything about the file caching attributes, so the data is most likely coming out of the filesystem cache. And if you have a bunch of threads scanning through the same set of LBAs, the SSD cache might be helping as well. >> >>Try running 1 thread with 1 iteration right after boot and see what numbers you get. >> >>Regards, >>Ved >> >> >>From: Nai yan zhao <zha...@gm...> >>>To: jo...@ei... >>>Cc: Iom...@li... >>>Sent: Sunday, February 12, 2012 6:59 PM >>> >>>Subject: Re: [Iometer-devel] Please advise - Why IOPS by IOMeter is much slower than windows multi-threading data fetch IOPS? >>> >>> >>>Hello Joe, >>> Again, thank you for your reply! I will take your suggestion and try again. But I am very looking forward to your further investigation on Windows system for my program. >>> >>> >>> I trust IOMeter, but I can't explain why and where's the problem with my program. 
And further speaking, would you give me some comments? >>> 1) What's the difference between IOmeter I/O calculation and my program (although it's much much simpler)? From the behavior of IOMeter, it also seems to create a file on target disk and MAYBE fetch data from that file by pre-defined I/O size and policy. If I am wrong? >>> If I am not wrong, then why there's so much difference. Joe, by your experience, if my program has any big defect? >>> >>> >>> 2) My major purpose is to have a program in our production env. ,which will frequently fetch data from SSD, and there are also some additional operations/work after data fetched - this is also why you see I put some additional work after each I/O (such as memory allocation and de-allocation in I/O calculation). >>> What I expect to see, its benchmark SHOULD be less than I/OMeter benchmark. >>> >>> >>> Would you advise more? Is there any big defect in my program for either doing file I/O or I/O calculation? >>> >>> >>> Thanks in advance!! >>> >>> >>>Nai Yan. >>> >>> >>> >>> >>> >>> >>>2012/2/13 <jo...@ei...> >>> >>>Manufacturer's quoted sequential MB/s won't be with 512byte reads. In Iometer, try 256KB sequential reads with about 8 outstanding I/Os. That should come closer to the maximum throughput(I doubt you'll be able to get your laptop to actually get close to 520MB/s though). >>>> >>>>I'll see if I can find a windows system to try to compile/run your program, but I can't make any promises. >>>> >>>> >>>>Joe >>>> >>>> >>>>Quoting Nai yan zhao <zha...@gm...>: >>>> >>>> >>>>Hello Joe, >>>>> Thank you again for your time! >>>>> It's wired that from IOMeter, the throughput for sequential IOPS >>>>>(512B, queue depth is 64) is ONLY 42MB/s with around 82K IOPS. However, >>>>>from that SSD official website, this SSD sequential throughput should be >>>>>around 510MB/s ( >>>>>http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1, my SSD >>>>>is 128G). 
If there's any parameter I didn't set correctly in IOMeter? >>>>> >>>>> As you suggested, I try to create a 12GB sample file (my test bed >>>>>memory is 6GB and without RAID) and use 1 thread to do IO. The result >>>>>is 33666; However, with I/O meter, it's 11572 (throughput this time is ONLY >>>>>5.93MB/s); IOPS still 3 times!! >>>>> >>>>> I attach my IOMeter settings, if there's anything wrong? Also, I >>>>>attach my modified code. Joe, could you help again to see where's the >>>>>problem? >>>>> >>>>> Thank you so much!! >>>>> >>>>>Nai Yan. >>>>> >>>>>2012/2/13 <jo...@ei...> >>>>> >>>>> >>>>>82K sounds reasonable for iops on an SSD. You should check the specs of >>>>>>your drive to see what you should expect. >>>>>> >>>>>>You need to remember that you are doing file i/o so you have several >>>>>>layers of cache involved. think of it was file cache -> block cache -> >>>>>>controller cache -> drive cache (you aren't testing a HW RAID, so you >>>>>>probably don't have cache in you controller) My personal run of thumb for >>>>>>random I/O is to have my file size be about 3x my combined cache size. For >>>>>>example, 4G ram in system, 512MB RAID cache, (8 drives*32MB) = 4.75GB I'd >>>>>>do a 16GB file. >>>>>> >>>>>>If in iometer you are accessing a PHYSICALDISK, then you are avoiding >>>>>>window's file cache. >>>>>> >>>>>>I just pulled up the code and (keep in mind I'm not much of a windows guy) >>>>>>something looks odd in your GetSecs routine. The cast to double is going to >>>>>>lose resolution, I think I would store the start/end times as >>>>>>LARGE_INTEGER. And you probably only have to call the frequency routine once >>>>>> >>>>>>Also windows used to have issues in the HAL where if a thread got moved to >>>>>>a different processor you'd get odd results. There is a Windows API call >>>>>>for setting affinity, similar to the linux sched_set_affinity. 
>>>>>> >>>>>>This doesn't really matter for what we are talking about, it is just a pet >>>>>>peeve of mine, your "delete c;" should be "delete [] c;" (are you intending >>>>>>tp be timing your allocator calls as well? you may be if you are simulating >>>>>>system performance, but typically for disk performance you'd try to >>>>>>preallocate as much as possible so your only timing the transfers) >>>>>> >>>>>> >>>>>>If it were me I would start with something simplier, (say single threaded >>>>>>sequential read) and see if your program gets the correct values then. You >>>>>>could also fire up windows performance monitor and try to correlate to its >>>>>>counts as well (PHYSICALDISK transfers/sec). >>>>>> >>>>>>Good Luck, >>>>>> >>>>>>Joe >>>>>> >>>>>> >>>>>> >>>>>>Quoting Nai yan zhao <zha...@gm...>: >>>>>> >>>>>> Hello Fabian and Joe, >>>>>> >>>>>> Thank you so much for your reply. >>>>>>> >>>>>>> Actually, what I am trying to do, is to split a file into 32 parts, >>>>>>>and each part will be assigned to a thread to read. Each thread each time >>>>>>>to open file, read 512B, and close file. I was trying to avoid 2 read >>>>>>>I/Os >>>>>>>hit 1 block(512B) - i.e. to avoid cache in SSD (it's 128MB), although most >>>>>>>read I/Os are ordered but not >>>>>>>contiguous<http://en.**wikipedia.org/wiki/Contiguity#**Computer_science<http://en.wikipedia.org/wiki/Contiguity#Computer_science> >>>>>>> >>>>>>>> >>>>>>>. >>>>>>> >>>>>>> >>>>>>> By your suggestion, I tried 512B sequential I/O with settings below, >>>>>>> >>>>>>> Max disk size - 8388608 >>>>>>> # of Outstanding I/O - 32 (for 64, it's also around 82K) >>>>>>> Transfer request size - 512B, >>>>>>> 100% sequential >>>>>>> Reply size - no reply >>>>>>> Align I/Os on - Sector boundaries >>>>>>> >>>>>>> The result is around 82K, still much slower than my program. >>>>>>> >>>>>>> If my program has any defect in calculating IOPS? 
Or if I have any >>>>>>>misunderstanding of caching of SSD or file system, which causes my program >>>>>>>fetches data most from RAM of SSD? Or what parameters I should set in I/O >>>>>>>meter to simulate my program I/O? >>>>>>> >>>>>>> Thank you again in advance for your time to help investigate it!! >>>>>>> >>>>>>>Nai Yan. >>>>>>> >>>>>>>2012/2/11 Fabian Tillier <fa...@ti...> >>>>>>> >>>>>>> If I read the test correctly, all threads start at offset 0, and then >>>>>>> >>>>>>>perform 512b reads with a 1024b stride between reads. As Joe said, >>>>>>>>this is pretty much sequential reading, and all threads are reading >>>>>>>>the same data, so most are likely to be satisifed from cache, either >>>>>>>>in the OS or on the SSD itself. They'll do 320000/16=20000 IO >>>>>>>>operations total each, so end up reading 20MB of the file. It's quite >>>>>>>>likely that the whole 20MB that you are reading will sit happilly in >>>>>>>>the file cache. >>>>>>>> >>>>>>>>Create an access pattern that mimics your app (512b sequential with >>>>>>>>1024b stride), create 32 workers, and see if the results are similar. >>>>>>>>Best would be if you created a test file of 20MB, too. You can then >>>>>>>>see how things compare if you go with async I/O and a single thread. >>>>>>>> >>>>>>>>Cheers, >>>>>>>>-Fab >>>>>>>> >>>>>>>>On Fri, Feb 10, 2012 at 5:40 AM, <jo...@ei...> wrote: >>>>>>>>> Forgive me if I missed it, but I don't see any randomization in your >>>>>>>>> file reads. >>>>>>>>> >>>>>>>>> It looks like you just skip ahead so thread 0 reads the first >>>>>>>>> 512bytes, thread 1 the next 512b. So any storage will be prefetching >>>>>>>>> very effectively. >>>>>>>>> >>>>>>>>> Tell Iometer to do sequential instead of random and see how much >>>>>>>>> closer the numbers are. Or better yet, make your program randomize >>>>>>>>> its reads over the entire disk. 
>>>>>>>>> >>>>>>>>> Joe >>>>>>>>> >>>>>>>>> >>>>>>>>> Quoting Nai yan zhao <zha...@gm...>: >>>>>>>>> >>>>>>>>>> Greetings, >>>>>>>>>> Could anybody help me a little out of my difficulty? >>>>>>>>>> >>>>>>>>>> I have a SSD and I am trying to use it to simulate my program I/O >>>>>>>>>> performance, however, IOPS calculated from my program is much much >>>>>>>>faster >>>>>>>>>> than IOMeter. >>>>>>>>>> >>>>>>>>>> My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B random read >>>>>>>>>> IOPS is around 94k (queue depth is 32). >>>>>>>>>> However my program (32 windows threads) can reach around 500k >>>>>>>>512B >>>>>>>>>> IOPS, around 5 times of IOMeter!!! I did data validation but didn't >>>>>>>>find >>>>>>>>>> any error in data fetching. It's because my data fetching in order? >>>>>>>>>> >>>>>>>>>> I paste my code belwo (it mainly fetch 512B from file and release >>>>>>>>it; >>>>>>>>>> I did use 4bytes (an int) to validate program logic and didn't find >>>>>>>>>> problem), can anybody help me figure out where I am wrong? >>>>>>>>>> >>>>>>>>>> Thanks so much in advance!! >>>>>>>>>> >>>>>>>>>> Nai Yan. >>>>>>>>>> >>>>>>>>>> #include <stdio.h> >>>>>>>>>> #include <Windows.h> >>>>>>>>>> /* >>>>>>>>>> ** Purpose: Verify file random read IOPS in comparison with IOMeter >>>>>>>>>> ** Author: Nai Yan >>>>>>>>>> ** Date: Feb. 9th, 2012 >>>>>>>>>> **/ >>>>>>>>>> //Global variables >>>>>>>>>> long completeIOs = 0; >>>>>>>>>> long completeBytes = 0; >>>>>>>>>> int threadCount = 32; >>>>>>>>>> unsigned long long length = 1073741824; //4G test >>>>>>>>file >>>>>>>>>> int interval = 1024; >>>>>>>>>> int resultArrayLen = 320000; >>>>>>>>>> int *result = new int[resultArrayLen]; >>>>>>>>>> //Method declarison >>>>>>>>>> double GetSecs(void); //Calculate out duration >>>>>>>>>> int InitPool(long long,char*,int); //Initialize test data for >>>>>>>>>> testing, if successful, return 1; otherwise, return a non 1 value. 
>>>>>>>>>> int * FileRead(char * path); >>>>>>>>>> unsigned int DataVerification(int*, int sampleItem); >>>>>>>>>> //Verify data fetched from pool >>>>>>>>>> int main() >>>>>>>>>> { >>>>>>>>>> int sampleItem = 0x1; >>>>>>>>>> char * fPath = "G:\\workspace\\4G.bin"; >>>>>>>>>> unsigned int invalidIO = 0; >>>>>>>>>> if (InitPool(length,fPath,**sampleItem)!= 1) >>>>>>>> >>>>>>>>>> printf("File write err... \n"); >>>>>>>>>> //start do random I/Os from initialized file >>>>>>>>>> double start = GetSecs(); >>>>>>>>>> int * fetchResult = FileRead(fPath); >>>>>>>>>> double end = GetSecs(); >>>>>>>>>> printf("File read IOPS is %.4f per second.. \n",completeIOs/(end - >>>>>>>>start)); >>>>>>>>>> //start data validation, for 4 bytes fetch only >>>>>>>>>> // invalidIO = DataVerification(fetchResult,**sampleItem); >>>>>>>> >>>>>>>>>> // if (invalidIO !=0) >>>>>>>>>> // { >>>>>>>>>> // printf("Total invalid data fetch IOs are %d", invalidIO); >>>>>>>>>> // } >>>>>>>>>> return 0; >>>>>>>>>> } >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> int InitPool(long long length, char* path, int sample) >>>>>>>>>> { >>>>>>>>>> printf("Start initializing test data ... \n"); >>>>>>>>>> FILE * fp = fopen(path,"wb"); >>>>>>>>>> if (fp == NULL) >>>>>>>>>> { >>>>>>>>>> printf("file open err... \n"); >>>>>>>>>> exit (-1); >>>>>>>>>> } >>>>>>>>>> else //initialize file for testing >>>>>>>>>> { >>>>>>>>>> fseek(fp,0L,SEEK_SET); >>>>>>>>>> for (int i=0; i<length; i++) >>>>>>>>>> { >>>>>>>>>> fwrite(&sample,sizeof(int),1,**fp); >>>>>>>> >>>>>>>>>> } >>>>>>>>>> fclose(fp); >>>>>>>>>> fp = NULL; >>>>>>>>>> printf("Data initialization is complete...\n"); >>>>>>>>>> return 1; >>>>>>>>>> } >>>>>>>>>> } >>>>>>>>>> double GetSecs(void) >>>>>>>>>> { >>>>>>>>>> LARGE_INTEGER frequency; >>>>>>>>>> LARGE_INTEGER start; >>>>>>>>>> if(! QueryPerformanceFrequency(&**frequency)) >>>>>>>>>> printf("**QueryPerformanceFrequency Failed\n"); >>>>>>>>>> if(! 
QueryPerformanceCounter(&**start)) >>>>>>>>>> printf("**QueryPerformanceCounter Failed\n"); >>>>>>>>>> return ((double)start.QuadPart/(**double)frequency.QuadPart); >>>>>>>> >>>>>>>>>> } >>>>>>>>>> class input >>>>>>>>>> { >>>>>>>>>> public: >>>>>>>>>> char *path; >>>>>>>>>> int starting; >>>>>>>>>> input (int st, char * filePath):starting(st),path(**filePath){} >>>>>>>> >>>>>>>>>> }; >>>>>>>>>> //Workers >>>>>>>>>> DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) >>>>>>>>>> { >>>>>>>>>> input * in = (input*) lpThreadParameter; >>>>>>>>>> char* path = in->path; >>>>>>>>>> FILE * fp = fopen(path,"rb"); >>>>>>>>>> int sPos = in->starting; >>>>>>>>>> // int * result = in->r; >>>>>>>>>> if(fp != NULL) >>>>>>>>>> { >>>>>>>>>> fpos_t pos; >>>>>>>>>> for (int i=0; i<resultArrayLen/threadCount;**i++) >>>>>>>> >>>>>>>>>> { >>>>>>>>>> pos = i * interval; >>>>>>>>>> fsetpos(fp,&pos); >>>>>>>>>> //For 512 bytes fetch each time >>>>>>>>>> unsigned char *c =new unsigned char [512]; >>>>>>>>>> if (fread(c,512,1,fp) ==1) >>>>>>>>>> { >>>>>>>>>> InterlockedIncrement(&**completeIOs); >>>>>>>> >>>>>>>>>> delete c; >>>>>>>>>> } >>>>>>>>>> //For 4 bytes fetch each time >>>>>>>>>> /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) >>>>>>>>>> { >>>>>>>>>> InterlockedIncrement(&**completeIOs); >>>>>>>> >>>>>>>>>> }*/ >>>>>>>>>> else >>>>>>>>>> { >>>>>>>>>> printf("file read err...\n"); >>>>>>>>>> exit(-1); >>>>>>>>>> } >>>>>>>>>> } >>>>>>>>>> fclose(fp); >>>>>>>>>> fp = NULL; >>>>>>>>>> } >>>>>>>>>> else >>>>>>>>>> { >>>>>>>>>> printf("File open err... \n"); >>>>>>>>>> exit(-1); >>>>>>>>>> } >>>>>>>>>> } >>>>>>>>>> int * FileRead(char * p) >>>>>>>>>> { >>>>>>>>>> printf("Starting reading file ... 
\n"); >>>>>>>>>> HANDLE mWorkThread[256]; //max 256 threads >>>>>>>>>> completeIOs = 0; >>>>>>>>>> int slice = int (resultArrayLen/threadCount); >>>>>>>>>> for(int i = 0; i < threadCount; i++) >>>>>>>>>> { >>>>>>>>>> mWorkThread[i] = CreateThread( >>>>>>>>>> NULL, >>>>>>>>>> 0, >>>>>>>>>> FileReadThreadEntry, >>>>>>>>>> (LPVOID)(new input(i*slice,p)), >>>>>>>>>> 0, >>>>>>>>>> NULL); >>>>>>>>>> } >>>>>>>>>> WaitForMultipleObjects(**threadCount, mWorkThread, TRUE, INFINITE); >>>>>>>> >>>>>>>>>> printf("File read complete... \n"); >>>>>>>>>> return result; >>>>>>>>>> } >>>>>>>>>> unsigned int DataVerification(int* result, int sampleItem) >>>>>>>>>> { >>>>>>>>>> unsigned int invalid = 0; >>>>>>>>>> for (int i=0; i< resultArrayLen/interval;i++) >>>>>>>>>> { >>>>>>>>>> if (result[i]!=sampleItem) >>>>>>>>>> { >>>>>>>>>> invalid ++; >>>>>>>>>> continue; >>>>>>>>>> } >>>>>>>>>> } >>>>>>>>>> return invalid; >>>>>>>>>> } >>>>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>>------------------------------**------------------------------** >>>>>>>> >>>>>>>>------------------ >>>>>>>>> Virtualization & Cloud Management Using Capacity Planning >>>>>>>>> Cloud computing makes use of virtualization - but cloud computing >>>>>>>>> also focuses on allowing computing to be delivered as a service. >>>>>>>>> http://www.accelacomm.com/jaw/**sfnl/114/51521223/<http://www.accelacomm.com/jaw/sfnl/114/51521223/> >>>>>>>>> ______________________________**_________________ >>>>>>>>> Iometer-devel mailing list >>>>>>>>> Iometer-devel@lists.**sourceforge.net<Iom...@li...> >>>>>>>>> https://lists.sourceforge.net/**lists/listinfo/iometer-devel<https://lists.sourceforge.net/lists/listinfo/iometer-devel> >>>>>>>> >>>>>>>> >>>>>>>> >>>>>>> >>>>>> >>>>>> >>>>>> >>>>> >>>> >>>> >>>> >>> >>>------------------------------------------------------------------------------ >>>Try before you buy = See our experts in action! 
>>>The most comprehensive online learning library for Microsoft developers >>>is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3, >>>Metro Style Apps, more. Free future releases when you subscribe now! >>>http://p.sf.net/sfu/learndevnow-dev2 >>> >>>_______________________________________________ >>>Iometer-devel mailing list >>>Iom...@li... >>>https://lists.sourceforge.net/lists/listinfo/iometer-devel >>> >>> >>> > > > > |
From: Nai y. z. <zha...@gm...> - 2012-02-14 17:53:05
|
Hello Vedran, In fact, before every run I already comment file initialization part. If I missed anything in terms of caching? This time, I seriously rerun my program (file initialization part is already been commented) and then I/O meter after boot. Here is the result, Windows native threads (1 thread) - 28491.6166 IOPS for 512 bytes read IOMeter (1 outstanding I/O) - 117460.61 IOPS for 512bytes sequential read key parameters I used in IOMeter, Max disk size - 25165824 # of outstanding I/O - 1 Transfer request size -512bytes 100% sequential read Align I/O - sector boundaries Ramp up time - 60s Run time - 4 minutes For your convenience, I will use another email to send you snapshots since it will exceed 64KB. Looking forward to your further advice! Thanks. Nai Yan. 2012/2/14 Vedran Degoricija <ve...@ya...>: > Hi Nai Yan, > > You need to double check your file access pattern/stride as the others have > suggested. I'd also validate your file flags since after your initialization > phase, a good portion of your file will be in the cache. > > What does Iometer report in the 28K IOPs case, and what are the workload > parameters you are using? > > Good luck, > Ved > > > > From: Nai yan zhao <zha...@gm...> > To: ve...@ya... > Cc: "Iom...@li..." > <iom...@li...> > Sent: Monday, February 13, 2012 5:35 PM > Subject: [Iometer-devel] Please advise - Why IOPS by IOMeter is much slower > than windows multi-threading data fetch IOPS? > > Hello Vedran, > Thank you for your time to reply. However it seems filesystem cache is > not the cause for this problem. I tried 2 times run after boot, and each is > around 28K (although a little less than yesterday). Regarding SSD, I was > trying to ask each read I/O to access different LBA to avoid to hit SSD > cache. > > Any further suggestion? Thanks! > > Nai Yan. 
> > > > 2012/2/13 Vedran Degoricija <ve...@ya...> > > Nai Yan, > > Your program does not specify anything about the file caching attributes, so > the data is most likely coming out of the filesystem cache. And if you have > a bunch of threads scanning through the same set of LBAs, the SSD cache > might be helping as well. > > Try running 1 thread with 1 iteration right after boot and see what numbers > you get. > > Regards, > Ved > > From: Nai yan zhao <zha...@gm...> > To: jo...@ei... > Cc: Iom...@li... > Sent: Sunday, February 12, 2012 6:59 PM > > Subject: Re: [Iometer-devel] Please advise - Why IOPS by IOMeter is much > slower than windows multi-threading data fetch IOPS? > > Hello Joe, > Again, thank you for your reply! I will take your suggestion and try > again. But I am very looking forward to your further investigation on > Windows system for my program. > > I trust IOMeter, but I can't explain why and where's the problem with > my program. And further speaking, would you give me some comments? > 1) What's the difference between IOmeter I/O calculation and my program > (although it's much much simpler)? From the behavior of IOMeter, it also > seems to create a file on target disk and MAYBE fetch data from that file by > pre-defined I/O size and policy. If I am wrong? > If I am not wrong, then why there's so much difference. Joe, by > your experience, if my program has any big defect? > > 2) My major purpose is to have a program in our production env. ,which > will frequently fetch data from SSD, and there are also some additional > operations/work after data fetched - this is also why you see I put some > additional work after each I/O (such as memory allocation and de-allocation > in I/O calculation). > What I expect to see, its benchmark SHOULD be less than I/OMeter > benchmark. > > Would you advise more? Is there any big defect in my program for either > doing file I/O or I/O calculation? > > Thanks in advance!! > > Nai Yan. 
> > > > 2012/2/13 <jo...@ei...> > > Manufacturer's quoted sequential MB/s won't be with 512byte reads. In > Iometer, try 256KB sequential reads with about 8 outstanding I/Os. That > should come closer to the maximum throughput(I doubt you'll be able to get > your laptop to actually get close to 520MB/s though). > > I'll see if I can find a windows system to try to compile/run your program, > but I can't make any promises. > > > Joe > > > Quoting Nai yan zhao <zha...@gm...>: > > Hello Joe, > Thank you again for your time! > It's wired that from IOMeter, the throughput for sequential IOPS > (512B, queue depth is 64) is ONLY 42MB/s with around 82K IOPS. However, > from that SSD official website, this SSD sequential throughput should be > around 510MB/s ( > http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1, my SSD > is 128G). If there's any parameter I didn't set correctly in IOMeter? > > As you suggested, I try to create a 12GB sample file (my test bed > memory is 6GB and without RAID) and use 1 thread to do IO. The result > is 33666; However, with I/O meter, it's 11572 (throughput this time is ONLY > 5.93MB/s); IOPS still 3 times!! > > I attach my IOMeter settings, if there's anything wrong? Also, I > attach my modified code. Joe, could you help again to see where's the > problem? > > Thank you so much!! > > Nai Yan. > > 2012/2/13 <jo...@ei...> > > 82K sounds reasonable for iops on an SSD. You should check the specs of > your drive to see what you should expect. > > You need to remember that you are doing file i/o so you have several > layers of cache involved. think of it was file cache -> block cache -> > controller cache -> drive cache (you aren't testing a HW RAID, so you > probably don't have cache in you controller) My personal run of thumb for > random I/O is to have my file size be about 3x my combined cache size. For > example, 4G ram in system, 512MB RAID cache, (8 drives*32MB) = 4.75GB I'd > do a 16GB file. 
> > If in iometer you are accessing a PHYSICALDISK, then you are avoiding > window's file cache. > > I just pulled up the code and (keep in mind I'm not much of a windows guy) > something looks odd in your GetSecs routine. The cast to double is going to > lose resolution, I think I would store the start/end times as > LARGE_INTEGER. And you probably only have to call the frequency routine once > > Also windows used to have issues in the HAL where if a thread got moved to > a different processor you'd get odd results. There is a Windows API call > for setting affinity, similar to the linux sched_set_affinity. > > This doesn't really matter for what we are talking about, it is just a pet > peeve of mine, your "delete c;" should be "delete [] c;" (are you intending > tp be timing your allocator calls as well? you may be if you are simulating > system performance, but typically for disk performance you'd try to > preallocate as much as possible so your only timing the transfers) > > > If it were me I would start with something simplier, (say single threaded > sequential read) and see if your program gets the correct values then. You > could also fire up windows performance monitor and try to correlate to its > counts as well (PHYSICALDISK transfers/sec). > > Good Luck, > > Joe > > > > Quoting Nai yan zhao <zha...@gm...>: > > Hello Fabian and Joe, > > Thank you so much for your reply. > > Actually, what I am trying to do, is to split a file into 32 parts, > and each part will be assigned to a thread to read. Each thread each time > to open file, read 512B, and close file. I was trying to avoid 2 read > I/Os > hit 1 block(512B) - i.e. to avoid cache in SSD (it's 128MB), although most > read I/Os are ordered but not > contiguous<http://en.**wikipedia.org/wiki/Contiguity#**Computer_science<http://en.wikipedia.org/wiki/Contiguity#Computer_science> > >> > . 
> > > By your suggestion, I tried 512B sequential I/O with settings below, > > Max disk size - 8388608 > # of Outstanding I/O - 32 (for 64, it's also around 82K) > Transfer request size - 512B, > 100% sequential > Reply size - no reply > Align I/Os on - Sector boundaries > > The result is around 82K, still much slower than my program. > > If my program has any defect in calculating IOPS? Or if I have any > misunderstanding of caching of SSD or file system, which causes my program > fetches data most from RAM of SSD? Or what parameters I should set in I/O > meter to simulate my program I/O? > > Thank you again in advance for your time to help investigate it!! > > Nai Yan. > > 2012/2/11 Fabian Tillier <fa...@ti...> > > If I read the test correctly, all threads start at offset 0, and then > > perform 512b reads with a 1024b stride between reads. As Joe said, > this is pretty much sequential reading, and all threads are reading > the same data, so most are likely to be satisifed from cache, either > in the OS or on the SSD itself. They'll do 320000/16=20000 IO > operations total each, so end up reading 20MB of the file. It's quite > likely that the whole 20MB that you are reading will sit happilly in > the file cache. > > Create an access pattern that mimics your app (512b sequential with > 1024b stride), create 32 workers, and see if the results are similar. > Best would be if you created a test file of 20MB, too. You can then > see how things compare if you go with async I/O and a single thread. > > Cheers, > -Fab > > On Fri, Feb 10, 2012 at 5:40 AM, <jo...@ei...> wrote: >> Forgive me if I missed it, but I don't see any randomization in your >> file reads. >> >> It looks like you just skip ahead so thread 0 reads the first >> 512bytes, thread 1 the next 512b. So any storage will be prefetching >> very effectively. >> >> Tell Iometer to do sequential instead of random and see how much >> closer the numbers are. 
Or better yet, make your program randomize >> its reads over the entire disk. >> >> Joe >> >> >> Quoting Nai yan zhao <zha...@gm...>: >> >>> Greetings, >>> Could anybody help me a little out of my difficulty? >>> >>> I have a SSD and I am trying to use it to simulate my program I/O >>> performance, however, IOPS calculated from my program is much much > faster >>> than IOMeter. >>> >>> My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B random read >>> IOPS is around 94k (queue depth is 32). >>> However my program (32 windows threads) can reach around 500k > 512B >>> IOPS, around 5 times of IOMeter!!! I did data validation but didn't > find >>> any error in data fetching. It's because my data fetching in order? >>> >>> I paste my code belwo (it mainly fetch 512B from file and release > it; >>> I did use 4bytes (an int) to validate program logic and didn't find >>> problem), can anybody help me figure out where I am wrong? >>> >>> Thanks so much in advance!! >>> >>> Nai Yan. >>> >>> #include <stdio.h> >>> #include <Windows.h> >>> /* >>> ** Purpose: Verify file random read IOPS in comparison with IOMeter >>> ** Author: Nai Yan >>> ** Date: Feb. 9th, 2012 >>> **/ >>> //Global variables >>> long completeIOs = 0; >>> long completeBytes = 0; >>> int threadCount = 32; >>> unsigned long long length = 1073741824; //4G test > file >>> int interval = 1024; >>> int resultArrayLen = 320000; >>> int *result = new int[resultArrayLen]; >>> //Method declarison >>> double GetSecs(void); //Calculate out duration >>> int InitPool(long long,char*,int); //Initialize test data for >>> testing, if successful, return 1; otherwise, return a non 1 value. >>> int * FileRead(char * path); >>> unsigned int DataVerification(int*, int sampleItem); >>> //Verify data fetched from pool >>> int main() >>> { >>> int sampleItem = 0x1; >>> char * fPath = "G:\\workspace\\4G.bin"; >>> unsigned int invalidIO = 0; >>> if (InitPool(length,fPath,**sampleItem)!= 1) > >>> printf("File write err... 
\n"); >>> //start do random I/Os from initialized file >>> double start = GetSecs(); >>> int * fetchResult = FileRead(fPath); >>> double end = GetSecs(); >>> printf("File read IOPS is %.4f per second.. \n",completeIOs/(end - > start)); >>> //start data validation, for 4 bytes fetch only >>> // invalidIO = DataVerification(fetchResult,**sampleItem); > >>> // if (invalidIO !=0) >>> // { >>> // printf("Total invalid data fetch IOs are %d", invalidIO); >>> // } >>> return 0; >>> } >>> >>> >>> int InitPool(long long length, char* path, int sample) >>> { >>> printf("Start initializing test data ... \n"); >>> FILE * fp = fopen(path,"wb"); >>> if (fp == NULL) >>> { >>> printf("file open err... \n"); >>> exit (-1); >>> } >>> else //initialize file for testing >>> { >>> fseek(fp,0L,SEEK_SET); >>> for (int i=0; i<length; i++) >>> { >>> fwrite(&sample,sizeof(int),1,**fp); > >>> } >>> fclose(fp); >>> fp = NULL; >>> printf("Data initialization is complete...\n"); >>> return 1; >>> } >>> } >>> double GetSecs(void) >>> { >>> LARGE_INTEGER frequency; >>> LARGE_INTEGER start; >>> if(! QueryPerformanceFrequency(&**frequency)) >>> printf("**QueryPerformanceFrequency Failed\n"); >>> if(! 
QueryPerformanceCounter(&**start)) >>> printf("**QueryPerformanceCounter Failed\n"); >>> return ((double)start.QuadPart/(**double)frequency.QuadPart); > >>> } >>> class input >>> { >>> public: >>> char *path; >>> int starting; >>> input (int st, char * filePath):starting(st),path(**filePath){} > >>> }; >>> //Workers >>> DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) >>> { >>> input * in = (input*) lpThreadParameter; >>> char* path = in->path; >>> FILE * fp = fopen(path,"rb"); >>> int sPos = in->starting; >>> // int * result = in->r; >>> if(fp != NULL) >>> { >>> fpos_t pos; >>> for (int i=0; i<resultArrayLen/threadCount;**i++) > >>> { >>> pos = i * interval; >>> fsetpos(fp,&pos); >>> //For 512 bytes fetch each time >>> unsigned char *c =new unsigned char [512]; >>> if (fread(c,512,1,fp) ==1) >>> { >>> InterlockedIncrement(&**completeIOs); > >>> delete c; >>> } >>> //For 4 bytes fetch each time >>> /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) >>> { >>> InterlockedIncrement(&**completeIOs); > >>> }*/ >>> else >>> { >>> printf("file read err...\n"); >>> exit(-1); >>> } >>> } >>> fclose(fp); >>> fp = NULL; >>> } >>> else >>> { >>> printf("File open err... \n"); >>> exit(-1); >>> } >>> } >>> int * FileRead(char * p) >>> { >>> printf("Starting reading file ... \n"); >>> HANDLE mWorkThread[256]; //max 256 threads >>> completeIOs = 0; >>> int slice = int (resultArrayLen/threadCount); >>> for(int i = 0; i < threadCount; i++) >>> { >>> mWorkThread[i] = CreateThread( >>> NULL, >>> 0, >>> FileReadThreadEntry, >>> (LPVOID)(new input(i*slice,p)), >>> 0, >>> NULL); >>> } >>> WaitForMultipleObjects(**threadCount, mWorkThread, TRUE, INFINITE); > >>> printf("File read complete... 
\n"); >>> return result; >>> } >>> unsigned int DataVerification(int* result, int sampleItem) >>> { >>> unsigned int invalid = 0; >>> for (int i=0; i< resultArrayLen/interval;i++) >>> { >>> if (result[i]!=sampleItem) >>> { >>> invalid ++; >>> continue; >>> } >>> } >>> return invalid; >>> } >>> >> >> >> >> >> > ------------------------------**------------------------------** > > ------------------ >> Virtualization & Cloud Management Using Capacity Planning >> Cloud computing makes use of virtualization - but cloud computing >> also focuses on allowing computing to be delivered as a service. >> >> http://www.accelacomm.com/jaw/**sfnl/114/51521223/<http://www.accelacomm.com/jaw/sfnl/114/51521223/> >> ______________________________**_________________ >> Iometer-devel mailing list >> Iometer-devel@lists.**sourceforge.net<Iom...@li...> >> >> https://lists.sourceforge.net/**lists/listinfo/iometer-devel<https://lists.sourceforge.net/lists/listinfo/iometer-devel> > > > > > > > > > > > > > ------------------------------------------------------------------------------ > Try before you buy = See our experts in action! > The most comprehensive online learning library for Microsoft developers > is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3, > Metro Style Apps, more. Free future releases when you subscribe now! > http://p.sf.net/sfu/learndevnow-dev2 > > _______________________________________________ > Iometer-devel mailing list > Iom...@li... > https://lists.sourceforge.net/lists/listinfo/iometer-devel > > > > > > |
From: Nai y. z. <zha...@gm...> - 2012-02-14 18:10:01
|
Hello Joe, I just tried 256KB with 8 outstanding I/Os. It's ONLY 188.84MB. >From my official SSD benchmark - sequential read is 510MB/s (mine is PX-128M3S, http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1). My IOMeter version is iometer-1.1.0-rc1-win32.i386-bin. OS is windows 7 64bit Home Premium SP1. Memory is 6GB, CPU is Intel i5-2430M. I will send you snapshots via another email, since its size exceeds 64KB. If I didn't use IOMeter right? Again, would you help explain more about difference between I/O meter read I/O calculation and my program's? Please advise!! Thanks. Nai Yan. 2012/2/13 <jo...@ei...>: > Manufacturer's quoted sequential MB/s won't be with 512byte reads. In > Iometer, try 256KB sequential reads with about 8 outstanding I/Os. That > should come closer to the maximum throughput(I doubt you'll be able to get > your laptop to actually get close to 520MB/s though). > > I'll see if I can find a windows system to try to compile/run your program, > but I can't make any promises. > > > Joe > > > Quoting Nai yan zhao <zha...@gm...>: > >> Hello Joe, >> Thank you again for your time! >> It's wired that from IOMeter, the throughput for sequential IOPS >> (512B, queue depth is 64) is ONLY 42MB/s with around 82K IOPS. However, >> from that SSD official website, this SSD sequential throughput should be >> around 510MB/s ( >> http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1, my SSD >> is 128G). If there's any parameter I didn't set correctly in IOMeter? >> >> As you suggested, I try to create a 12GB sample file (my test bed >> memory is 6GB and without RAID) and use 1 thread to do IO. The result >> is 33666; However, with I/O meter, it's 11572 (throughput this time is >> ONLY >> 5.93MB/s); IOPS still 3 times!! >> >> I attach my IOMeter settings, if there's anything wrong? Also, I >> attach my modified code. Joe, could you help again to see where's the >> problem? >> >> Thank you so much!! >> >> Nai Yan. 
>> >> 2012/2/13 <jo...@ei...> >> >>> 82K sounds reasonable for iops on an SSD. You should check the specs of >>> your drive to see what you should expect. >>> >>> You need to remember that you are doing file i/o so you have several >>> layers of cache involved. think of it was file cache -> block cache -> >>> controller cache -> drive cache (you aren't testing a HW RAID, so you >>> probably don't have cache in you controller) My personal run of thumb for >>> random I/O is to have my file size be about 3x my combined cache size. >>> For >>> example, 4G ram in system, 512MB RAID cache, (8 drives*32MB) = 4.75GB I'd >>> do a 16GB file. >>> >>> If in iometer you are accessing a PHYSICALDISK, then you are avoiding >>> window's file cache. >>> >>> I just pulled up the code and (keep in mind I'm not much of a windows >>> guy) >>> something looks odd in your GetSecs routine. The cast to double is going >>> to >>> lose resolution, I think I would store the start/end times as >>> LARGE_INTEGER. And you probably only have to call the frequency routine >>> once >>> >>> Also windows used to have issues in the HAL where if a thread got moved >>> to >>> a different processor you'd get odd results. There is a Windows API call >>> for setting affinity, similar to the linux sched_set_affinity. >>> >>> This doesn't really matter for what we are talking about, it is just a >>> pet >>> peeve of mine, your "delete c;" should be "delete [] c;" (are you >>> intending >>> tp be timing your allocator calls as well? you may be if you are >>> simulating >>> system performance, but typically for disk performance you'd try to >>> preallocate as much as possible so your only timing the transfers) >>> >>> >>> If it were me I would start with something simplier, (say single threaded >>> sequential read) and see if your program gets the correct values then. >>> You >>> could also fire up windows performance monitor and try to correlate to >>> its >>> counts as well (PHYSICALDISK transfers/sec). 
>>> >>> Good Luck, >>> >>> Joe >>> >>> >>> >>> Quoting Nai yan zhao <zha...@gm...>: >>> >>> Hello Fabian and Joe, >>>> >>>> Thank you so much for your reply. >>>> >>>> Actually, what I am trying to do, is to split a file into 32 parts, >>>> and each part will be assigned to a thread to read. Each thread each >>>> time >>>> to open file, read 512B, and close file. I was trying to avoid 2 read >>>> I/Os >>>> hit 1 block(512B) - i.e. to avoid cache in SSD (it's 128MB), although >>>> most >>>> read I/Os are ordered but not >>>> >>>> contiguous<http://en.**wikipedia.org/wiki/Contiguity#**Computer_science<http://en.wikipedia.org/wiki/Contiguity#Computer_science> >>>> >>>> > >>>> . >>>> >>>> >>>> By your suggestion, I tried 512B sequential I/O with settings below, >>>> >>>> Max disk size - 8388608 >>>> # of Outstanding I/O - 32 (for 64, it's also around 82K) >>>> Transfer request size - 512B, >>>> 100% sequential >>>> Reply size - no reply >>>> Align I/Os on - Sector boundaries >>>> >>>> The result is around 82K, still much slower than my program. >>>> >>>> If my program has any defect in calculating IOPS? Or if I have any >>>> misunderstanding of caching of SSD or file system, which causes my >>>> program >>>> fetches data most from RAM of SSD? Or what parameters I should set in >>>> I/O >>>> meter to simulate my program I/O? >>>> >>>> Thank you again in advance for your time to help investigate it!! >>>> >>>> Nai Yan. >>>> >>>> 2012/2/11 Fabian Tillier <fa...@ti...> >>>> >>>> If I read the test correctly, all threads start at offset 0, and then >>>>> >>>>> perform 512b reads with a 1024b stride between reads. As Joe said, >>>>> this is pretty much sequential reading, and all threads are reading >>>>> the same data, so most are likely to be satisifed from cache, either >>>>> in the OS or on the SSD itself. They'll do 320000/16=20000 IO >>>>> operations total each, so end up reading 20MB of the file. 
It's quite >>>>> likely that the whole 20MB that you are reading will sit happilly in >>>>> the file cache. >>>>> >>>>> Create an access pattern that mimics your app (512b sequential with >>>>> 1024b stride), create 32 workers, and see if the results are similar. >>>>> Best would be if you created a test file of 20MB, too. You can then >>>>> see how things compare if you go with async I/O and a single thread. >>>>> >>>>> Cheers, >>>>> -Fab >>>>> >>>>> On Fri, Feb 10, 2012 at 5:40 AM, <jo...@ei...> wrote: >>>>> > Forgive me if I missed it, but I don't see any randomization in your >>>>> > file reads. >>>>> > >>>>> > It looks like you just skip ahead so thread 0 reads the first >>>>> > 512bytes, thread 1 the next 512b. So any storage will be prefetching >>>>> > very effectively. >>>>> > >>>>> > Tell Iometer to do sequential instead of random and see how much >>>>> > closer the numbers are. Or better yet, make your program randomize >>>>> > its reads over the entire disk. >>>>> > >>>>> > Joe >>>>> > >>>>> > >>>>> > Quoting Nai yan zhao <zha...@gm...>: >>>>> > >>>>> >> Greetings, >>>>> >> Could anybody help me a little out of my difficulty? >>>>> >> >>>>> >> I have a SSD and I am trying to use it to simulate my program >>>>> >> I/O >>>>> >> performance, however, IOPS calculated from my program is much much >>>>> faster >>>>> >> than IOMeter. >>>>> >> >>>>> >> My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B random >>>>> >> read >>>>> >> IOPS is around 94k (queue depth is 32). >>>>> >> However my program (32 windows threads) can reach around 500k >>>>> 512B >>>>> >> IOPS, around 5 times of IOMeter!!! I did data validation but didn't >>>>> find >>>>> >> any error in data fetching. It's because my data fetching in order? >>>>> >> >>>>> >> I paste my code belwo (it mainly fetch 512B from file and >>>>> >> release >>>>> it; >>>>> >> I did use 4bytes (an int) to validate program logic and didn't find >>>>> >> problem), can anybody help me figure out where I am wrong? 
>>>>> >> >>>>> >> Thanks so much in advance!! >>>>> >> >>>>> >> Nai Yan. >>>>> >> >>>>> >> #include <stdio.h> >>>>> >> #include <Windows.h> >>>>> >> /* >>>>> >> ** Purpose: Verify file random read IOPS in comparison with IOMeter >>>>> >> ** Author: Nai Yan >>>>> >> ** Date: Feb. 9th, 2012 >>>>> >> **/ >>>>> >> //Global variables >>>>> >> long completeIOs = 0; >>>>> >> long completeBytes = 0; >>>>> >> int threadCount = 32; >>>>> >> unsigned long long length = 1073741824; //4G test >>>>> file >>>>> >> int interval = 1024; >>>>> >> int resultArrayLen = 320000; >>>>> >> int *result = new int[resultArrayLen]; >>>>> >> //Method declarison >>>>> >> double GetSecs(void); //Calculate out duration >>>>> >> int InitPool(long long,char*,int); //Initialize test data >>>>> >> for >>>>> >> testing, if successful, return 1; otherwise, return a non 1 value. >>>>> >> int * FileRead(char * path); >>>>> >> unsigned int DataVerification(int*, int sampleItem); >>>>> >> //Verify data fetched from pool >>>>> >> int main() >>>>> >> { >>>>> >> int sampleItem = 0x1; >>>>> >> char * fPath = "G:\\workspace\\4G.bin"; >>>>> >> unsigned int invalidIO = 0; >>>>> >> if (InitPool(length,fPath,**sampleItem)!= 1) >>>>> >>>>> >> printf("File write err... \n"); >>>>> >> //start do random I/Os from initialized file >>>>> >> double start = GetSecs(); >>>>> >> int * fetchResult = FileRead(fPath); >>>>> >> double end = GetSecs(); >>>>> >> printf("File read IOPS is %.4f per second.. \n",completeIOs/(end - >>>>> start)); >>>>> >> //start data validation, for 4 bytes fetch only >>>>> >> // invalidIO = DataVerification(fetchResult,**sampleItem); >>>>> >>>>> >> // if (invalidIO !=0) >>>>> >> // { >>>>> >> // printf("Total invalid data fetch IOs are %d", invalidIO); >>>>> >> // } >>>>> >> return 0; >>>>> >> } >>>>> >> >>>>> >> >>>>> >> int InitPool(long long length, char* path, int sample) >>>>> >> { >>>>> >> printf("Start initializing test data ... 
\n"); >>>>> >> FILE * fp = fopen(path,"wb"); >>>>> >> if (fp == NULL) >>>>> >> { >>>>> >> printf("file open err... \n"); >>>>> >> exit (-1); >>>>> >> } >>>>> >> else //initialize file for testing >>>>> >> { >>>>> >> fseek(fp,0L,SEEK_SET); >>>>> >> for (int i=0; i<length; i++) >>>>> >> { >>>>> >> fwrite(&sample,sizeof(int),1,**fp); >>>>> >>>>> >> } >>>>> >> fclose(fp); >>>>> >> fp = NULL; >>>>> >> printf("Data initialization is complete...\n"); >>>>> >> return 1; >>>>> >> } >>>>> >> } >>>>> >> double GetSecs(void) >>>>> >> { >>>>> >> LARGE_INTEGER frequency; >>>>> >> LARGE_INTEGER start; >>>>> >> if(! QueryPerformanceFrequency(&**frequency)) >>>>> >> printf("**QueryPerformanceFrequency Failed\n"); >>>>> >> if(! QueryPerformanceCounter(&**start)) >>>>> >> printf("**QueryPerformanceCounter Failed\n"); >>>>> >> return ((double)start.QuadPart/(**double)frequency.QuadPart); >>>>> >>>>> >> } >>>>> >> class input >>>>> >> { >>>>> >> public: >>>>> >> char *path; >>>>> >> int starting; >>>>> >> input (int st, char * filePath):starting(st),path(**filePath){} >>>>> >>>>> >> }; >>>>> >> //Workers >>>>> >> DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) >>>>> >> { >>>>> >> input * in = (input*) lpThreadParameter; >>>>> >> char* path = in->path; >>>>> >> FILE * fp = fopen(path,"rb"); >>>>> >> int sPos = in->starting; >>>>> >> // int * result = in->r; >>>>> >> if(fp != NULL) >>>>> >> { >>>>> >> fpos_t pos; >>>>> >> for (int i=0; i<resultArrayLen/threadCount;**i++) >>>>> >>>>> >> { >>>>> >> pos = i * interval; >>>>> >> fsetpos(fp,&pos); >>>>> >> //For 512 bytes fetch each time >>>>> >> unsigned char *c =new unsigned char [512]; >>>>> >> if (fread(c,512,1,fp) ==1) >>>>> >> { >>>>> >> InterlockedIncrement(&**completeIOs); >>>>> >>>>> >> delete c; >>>>> >> } >>>>> >> //For 4 bytes fetch each time >>>>> >> /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) >>>>> >> { >>>>> >> InterlockedIncrement(&**completeIOs); >>>>> >>>>> >> }*/ >>>>> >> else >>>>> >> { >>>>> >> 
printf("file read err...\n"); >>>>> >> exit(-1); >>>>> >> } >>>>> >> } >>>>> >> fclose(fp); >>>>> >> fp = NULL; >>>>> >> } >>>>> >> else >>>>> >> { >>>>> >> printf("File open err... \n"); >>>>> >> exit(-1); >>>>> >> } >>>>> >> } >>>>> >> int * FileRead(char * p) >>>>> >> { >>>>> >> printf("Starting reading file ... \n"); >>>>> >> HANDLE mWorkThread[256]; //max 256 threads >>>>> >> completeIOs = 0; >>>>> >> int slice = int (resultArrayLen/threadCount); >>>>> >> for(int i = 0; i < threadCount; i++) >>>>> >> { >>>>> >> mWorkThread[i] = CreateThread( >>>>> >> NULL, >>>>> >> 0, >>>>> >> FileReadThreadEntry, >>>>> >> (LPVOID)(new input(i*slice,p)), >>>>> >> 0, >>>>> >> NULL); >>>>> >> } >>>>> >> WaitForMultipleObjects(**threadCount, mWorkThread, TRUE, >>>>> >> INFINITE); >>>>> >>>>> >> printf("File read complete... \n"); >>>>> >> return result; >>>>> >> } >>>>> >> unsigned int DataVerification(int* result, int sampleItem) >>>>> >> { >>>>> >> unsigned int invalid = 0; >>>>> >> for (int i=0; i< resultArrayLen/interval;i++) >>>>> >> { >>>>> >> if (result[i]!=sampleItem) >>>>> >> { >>>>> >> invalid ++; >>>>> >> continue; >>>>> >> } >>>>> >> } >>>>> >> return invalid; >>>>> >> } >>>>> >> >>>>> > >>>>> > >>>>> > >>>>> > >>>>> > >>>>> ------------------------------**------------------------------** >>>>> >>>>> ------------------ >>>>> > Virtualization & Cloud Management Using Capacity Planning >>>>> > Cloud computing makes use of virtualization - but cloud computing >>>>> > also focuses on allowing computing to be delivered as a service. 
>>>>> > >>>>> > http://www.accelacomm.com/jaw/**sfnl/114/51521223/<http://www.accelacomm.com/jaw/sfnl/114/51521223/> >>>>> > ______________________________**_________________ >>>>> > Iometer-devel mailing list >>>>> > >>>>> > Iometer-devel@lists.**sourceforge.net<Iom...@li...> >>>>> > >>>>> > https://lists.sourceforge.net/**lists/listinfo/iometer-devel<https://lists.sourceforge.net/lists/listinfo/iometer-devel> >>>>> >>>>> >>>> >>> >>> >>> >> > > > |
From: Wesley B. <ad...@we...> - 2012-02-14 20:20:15
|
Hello! Just catching up on the list emails. Since you are on a windows system and you are wanting to get the same kinds of numbers as ioMeter you really should be using WriteFile and set the flags to unbuffered using FILE_FLAG_WRITE_THROUGH. With fwrite is a buffered write in most cases on windows. You should also make sure your buffer is sector aligned with the disk in question. Most disks are 512 byte sectors but there are newer 4k sector size disks and SSDs hitting the market as we speak. Since you aren't looking to do overlapped async IO through fwrite then just setting the write through flag should be what you are after. http://msdn.microsoft.com/en-us/library/windows/desktop/aa365747(v=vs.85).aspx -----Original Message----- From: Nai yan zhao [mailto:zha...@gm...] Sent: Tuesday, February 14, 2012 12:10 PM To: jo...@ei... Cc: Iom...@li... Subject: Re: [Iometer-devel] Please advise - Why IOPS by IOMeter is much slower than windows multi-threading data fetch IOPS? Hello Joe, I just tried 256KB with 8 outstanding I/Os. It's ONLY 188.84MB. >From my official SSD benchmark - sequential read is 510MB/s (mine is PX-128M3S, http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1). My IOMeter version is iometer-1.1.0-rc1-win32.i386-bin. OS is windows 7 64bit Home Premium SP1. Memory is 6GB, CPU is Intel i5-2430M. I will send you snapshots via another email, since its size exceeds 64KB. If I didn't use IOMeter right? Again, would you help explain more about difference between I/O meter read I/O calculation and my program's? Please advise!! Thanks. Nai Yan. 2012/2/13 <jo...@ei...>: > Manufacturer's quoted sequential MB/s won't be with 512byte reads. In > Iometer, try 256KB sequential reads with about 8 outstanding I/Os. > That should come closer to the maximum throughput(I doubt you'll be > able to get your laptop to actually get close to 520MB/s though). 
> > I'll see if I can find a windows system to try to compile/run your > program, but I can't make any promises. > > > Joe > > > Quoting Nai yan zhao <zha...@gm...>: > >> Hello Joe, >> Thank you again for your time! >> It's wired that from IOMeter, the throughput for sequential IOPS >> (512B, queue depth is 64) is ONLY 42MB/s with around 82K IOPS. >> However, from that SSD official website, this SSD sequential >> throughput should be around 510MB/s ( >> http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1, my >> SSD is 128G). If there's any parameter I didn't set correctly in IOMeter? >> >> As you suggested, I try to create a 12GB sample file (my test bed >> memory is 6GB and without RAID) and use 1 thread to do IO. The result >> is 33666; However, with I/O meter, it's 11572 (throughput this time >> is ONLY 5.93MB/s); IOPS still 3 times!! >> >> I attach my IOMeter settings, if there's anything wrong? Also, I >> attach my modified code. Joe, could you help again to see where's >> the problem? >> >> Thank you so much!! >> >> Nai Yan. >> >> 2012/2/13 <jo...@ei...> >> >>> 82K sounds reasonable for iops on an SSD. You should check the specs >>> of your drive to see what you should expect. >>> >>> You need to remember that you are doing file i/o so you have several >>> layers of cache involved. think of it was file cache -> block cache >>> -> controller cache -> drive cache (you aren't testing a HW RAID, so >>> you probably don't have cache in you controller) My personal run of >>> thumb for random I/O is to have my file size be about 3x my combined cache size. >>> For >>> example, 4G ram in system, 512MB RAID cache, (8 drives*32MB) = >>> 4.75GB I'd do a 16GB file. >>> >>> If in iometer you are accessing a PHYSICALDISK, then you are >>> avoiding window's file cache. >>> >>> I just pulled up the code and (keep in mind I'm not much of a >>> windows >>> guy) >>> something looks odd in your GetSecs routine. 
The cast to double is >>> going to lose resolution, I think I would store the start/end times >>> as LARGE_INTEGER. And you probably only have to call the frequency >>> routine once >>> >>> Also windows used to have issues in the HAL where if a thread got >>> moved to a different processor you'd get odd results. There is a >>> Windows API call for setting affinity, similar to the linux >>> sched_set_affinity. >>> >>> This doesn't really matter for what we are talking about, it is just >>> a pet peeve of mine, your "delete c;" should be "delete [] c;" (are >>> you intending tp be timing your allocator calls as well? you may be >>> if you are simulating system performance, but typically for disk >>> performance you'd try to preallocate as much as possible so your >>> only timing the transfers) >>> >>> >>> If it were me I would start with something simplier, (say single >>> threaded sequential read) and see if your program gets the correct values then. >>> You >>> could also fire up windows performance monitor and try to correlate >>> to its counts as well (PHYSICALDISK transfers/sec). >>> >>> Good Luck, >>> >>> Joe >>> >>> >>> >>> Quoting Nai yan zhao <zha...@gm...>: >>> >>> Hello Fabian and Joe, >>>> >>>> Thank you so much for your reply. >>>> >>>> Actually, what I am trying to do, is to split a file into 32 >>>> parts, and each part will be assigned to a thread to read. Each >>>> thread each time to open file, read 512B, and close file. I was >>>> trying to avoid 2 read I/Os hit 1 block(512B) - i.e. to avoid cache >>>> in SSD (it's 128MB), although most read I/Os are ordered but not >>>> >>>> contiguous<http://en.**wikipedia.org/wiki/Contiguity#**Computer_sci >>>> ence<http://en.wikipedia.org/wiki/Contiguity#Computer_science> >>>> >>>> > >>>> . 
>>>> >>>> >>>> By your suggestion, I tried 512B sequential I/O with settings >>>> below, >>>> >>>> Max disk size - 8388608 >>>> # of Outstanding I/O - 32 (for 64, it's also around 82K) >>>> Transfer request size - 512B, >>>> 100% sequential >>>> Reply size - no reply >>>> Align I/Os on - Sector boundaries >>>> >>>> The result is around 82K, still much slower than my program. >>>> >>>> If my program has any defect in calculating IOPS? Or if I have >>>> any misunderstanding of caching of SSD or file system, which causes >>>> my program fetches data most from RAM of SSD? Or what parameters I >>>> should set in I/O meter to simulate my program I/O? >>>> >>>> Thank you again in advance for your time to help investigate it!! >>>> >>>> Nai Yan. >>>> >>>> 2012/2/11 Fabian Tillier <fa...@ti...> >>>> >>>> If I read the test correctly, all threads start at offset 0, and >>>> then >>>>> >>>>> perform 512b reads with a 1024b stride between reads. As Joe >>>>> said, this is pretty much sequential reading, and all threads are >>>>> reading the same data, so most are likely to be satisifed from >>>>> cache, either in the OS or on the SSD itself. They'll do >>>>> 320000/16=20000 IO operations total each, so end up reading 20MB >>>>> of the file. It's quite likely that the whole 20MB that you are >>>>> reading will sit happilly in the file cache. >>>>> >>>>> Create an access pattern that mimics your app (512b sequential >>>>> with 1024b stride), create 32 workers, and see if the results are similar. >>>>> Best would be if you created a test file of 20MB, too. You can >>>>> then see how things compare if you go with async I/O and a single thread. >>>>> >>>>> Cheers, >>>>> -Fab >>>>> >>>>> On Fri, Feb 10, 2012 at 5:40 AM, <jo...@ei...> wrote: >>>>> > Forgive me if I missed it, but I don't see any randomization in >>>>> > your file reads. >>>>> > >>>>> > It looks like you just skip ahead so thread 0 reads the first >>>>> > 512bytes, thread 1 the next 512b. 
So any storage will be >>>>> > prefetching very effectively. >>>>> > >>>>> > Tell Iometer to do sequential instead of random and see how much >>>>> > closer the numbers are. Or better yet, make your program >>>>> > randomize its reads over the entire disk. >>>>> > >>>>> > Joe >>>>> > >>>>> > >>>>> > Quoting Nai yan zhao <zha...@gm...>: >>>>> > >>>>> >> Greetings, >>>>> >> Could anybody help me a little out of my difficulty? >>>>> >> >>>>> >> I have a SSD and I am trying to use it to simulate my >>>>> >> program I/O performance, however, IOPS calculated from my >>>>> >> program is much much >>>>> faster >>>>> >> than IOMeter. >>>>> >> >>>>> >> My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B >>>>> >> random read IOPS is around 94k (queue depth is 32). >>>>> >> However my program (32 windows threads) can reach around >>>>> >> 500k >>>>> 512B >>>>> >> IOPS, around 5 times of IOMeter!!! I did data validation but >>>>> >> didn't >>>>> find >>>>> >> any error in data fetching. It's because my data fetching in order? >>>>> >> >>>>> >> I paste my code belwo (it mainly fetch 512B from file and >>>>> >> release >>>>> it; >>>>> >> I did use 4bytes (an int) to validate program logic and didn't >>>>> >> find problem), can anybody help me figure out where I am wrong? >>>>> >> >>>>> >> Thanks so much in advance!! >>>>> >> >>>>> >> Nai Yan. >>>>> >> >>>>> >> #include <stdio.h> >>>>> >> #include <Windows.h> >>>>> >> /* >>>>> >> ** Purpose: Verify file random read IOPS in comparison with >>>>> >> IOMeter >>>>> >> ** Author: Nai Yan >>>>> >> ** Date: Feb. 
9th, 2012 >>>>> >> **/ >>>>> >> //Global variables >>>>> >> long completeIOs = 0; >>>>> >> long completeBytes = 0; >>>>> >> int threadCount = 32; >>>>> >> unsigned long long length = 1073741824; //4G test >>>>> file >>>>> >> int interval = 1024; >>>>> >> int resultArrayLen = 320000; >>>>> >> int *result = new int[resultArrayLen]; //Method declarison >>>>> >> double GetSecs(void); //Calculate out duration >>>>> >> int InitPool(long long,char*,int); //Initialize test data >>>>> >> for >>>>> >> testing, if successful, return 1; otherwise, return a non 1 value. >>>>> >> int * FileRead(char * path); >>>>> >> unsigned int DataVerification(int*, int sampleItem); //Verify >>>>> >> data fetched from pool int main() { int sampleItem = 0x1; char >>>>> >> * fPath = "G:\\workspace\\4G.bin"; unsigned int invalidIO = 0; >>>>> >> if (InitPool(length,fPath,**sampleItem)!= 1) >>>>> >>>>> >> printf("File write err... \n"); //start do random I/Os from >>>>> >> initialized file double start = GetSecs(); int * fetchResult = >>>>> >> FileRead(fPath); double end = GetSecs(); printf("File read >>>>> >> IOPS is %.4f per second.. \n",completeIOs/(end - >>>>> start)); >>>>> >> //start data validation, for 4 bytes fetch only // invalidIO = >>>>> >> DataVerification(fetchResult,**sampleItem); >>>>> >>>>> >> // if (invalidIO !=0) >>>>> >> // { >>>>> >> // printf("Total invalid data fetch IOs are %d", invalidIO); // >>>>> >> } return 0; } >>>>> >> >>>>> >> >>>>> >> int InitPool(long long length, char* path, int sample) { >>>>> >> printf("Start initializing test data ... \n"); FILE * fp = >>>>> >> fopen(path,"wb"); if (fp == NULL) { printf("file open err... 
>>>>> >> \n"); exit (-1); } else //initialize file for testing { >>>>> >> fseek(fp,0L,SEEK_SET); for (int i=0; i<length; i++) { >>>>> >> fwrite(&sample,sizeof(int),1,**fp); >>>>> >>>>> >> } >>>>> >> fclose(fp); >>>>> >> fp = NULL; >>>>> >> printf("Data initialization is complete...\n"); return 1; } } >>>>> >> double GetSecs(void) { >>>>> >> LARGE_INTEGER frequency; >>>>> >> LARGE_INTEGER start; >>>>> >> if(! QueryPerformanceFrequency(&**frequency)) >>>>> >> printf("**QueryPerformanceFrequency Failed\n"); >>>>> >> if(! QueryPerformanceCounter(&**start)) >>>>> >> printf("**QueryPerformanceCounter Failed\n"); return >>>>> >> ((double)start.QuadPart/(**double)frequency.QuadPart); >>>>> >>>>> >> } >>>>> >> class input >>>>> >> { >>>>> >> public: >>>>> >> char *path; >>>>> >> int starting; >>>>> >> input (int st, char * filePath):starting(st),path(**filePath){} >>>>> >>>>> >> }; >>>>> >> //Workers >>>>> >> DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) { >>>>> >> input * in = (input*) lpThreadParameter; >>>>> >> char* path = in->path; >>>>> >> FILE * fp = fopen(path,"rb"); >>>>> >> int sPos = in->starting; >>>>> >> // int * result = in->r; >>>>> >> if(fp != NULL) >>>>> >> { >>>>> >> fpos_t pos; >>>>> >> for (int i=0; i<resultArrayLen/threadCount;**i++) >>>>> >>>>> >> { >>>>> >> pos = i * interval; >>>>> >> fsetpos(fp,&pos); >>>>> >> //For 512 bytes fetch each time unsigned char *c =new unsigned >>>>> >> char [512]; if (fread(c,512,1,fp) ==1) { >>>>> >> InterlockedIncrement(&**completeIOs); >>>>> >>>>> >> delete c; >>>>> >> } >>>>> >> //For 4 bytes fetch each time >>>>> >> /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) { >>>>> >> InterlockedIncrement(&**completeIOs); >>>>> >>>>> >> }*/ >>>>> >> else >>>>> >> { >>>>> >> printf("file read err...\n"); >>>>> >> exit(-1); >>>>> >> } >>>>> >> } >>>>> >> fclose(fp); >>>>> >> fp = NULL; >>>>> >> } >>>>> >> else >>>>> >> { >>>>> >> printf("File open err... 
\n"); >>>>> >> exit(-1); >>>>> >> } >>>>> >> } >>>>> >> int * FileRead(char * p) >>>>> >> { >>>>> >> printf("Starting reading file ... \n"); >>>>> >> HANDLE mWorkThread[256]; //max 256 threads >>>>> >> completeIOs = 0; >>>>> >> int slice = int (resultArrayLen/threadCount); for(int i = 0; i >>>>> >> < threadCount; i++) { mWorkThread[i] = CreateThread( NULL, 0, >>>>> >> FileReadThreadEntry, (LPVOID)(new input(i*slice,p)), 0, NULL); >>>>> >> } >>>>> >> WaitForMultipleObjects(**threadCount, mWorkThread, TRUE, >>>>> >> INFINITE); >>>>> >>>>> >> printf("File read complete... \n"); >>>>> >> return result; >>>>> >> } >>>>> >> unsigned int DataVerification(int* result, int sampleItem) { >>>>> >> unsigned int invalid = 0; for (int i=0; i< >>>>> >> resultArrayLen/interval;i++) { if (result[i]!=sampleItem) { >>>>> >> invalid ++; continue; } } return invalid; } >>>>> >> >>>>> > >>>>> > >>>>> > >>>>> > >>>>> > >>>>> ------------------------------**------------------------------** >>>>> >>>>> ------------------ >>>>> > Virtualization & Cloud Management Using Capacity Planning Cloud >>>>> > computing makes use of virtualization - but cloud computing also >>>>> > focuses on allowing computing to be delivered as a service. >>>>> > >>>>> > http://www.accelacomm.com/jaw/**sfnl/114/51521223/<http://www.ac >>>>> > celacomm.com/jaw/sfnl/114/51521223/> >>>>> > ______________________________**_________________ >>>>> > Iometer-devel mailing list >>>>> > >>>>> > Iometer-devel@lists.**sourceforge.net<Iom...@li...urce >>>>> > forge.net> >>>>> > >>>>> > https://lists.sourceforge.net/**lists/listinfo/iometer-devel<htt >>>>> > ps://lists.sourceforge.net/lists/listinfo/iometer-devel> >>>>> >>>>> >>>> >>> >>> >>> >> > > > ------------------------------------------------------------------------------ Keep Your Developer Skills Current with LearnDevNow! The most comprehensive online learning library for Microsoft developers is just $99.99! 
Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3, Metro Style Apps, more. Free future releases when you subscribe now! http://p.sf.net/sfu/learndevnow-d2d _______________________________________________ Iometer-devel mailing list Iom...@li... https://lists.sourceforge.net/lists/listinfo/iometer-devel |
From: Nai y. z. <zha...@gm...> - 2012-02-15 02:32:37
|
Hello Wesley, Thank you for your time to catch up this long mail list. I am not sure if "FILE_FLAG_WRITE_THROUGH" can really help read IO calculation, which is what I am concerning about. But I will try it later. Actually, after test file written to disk, even though I reboot my machine and rerun my program (this time, not write same test file to disk and just do read file operation), the test result reflected what windows threads reading is much faster than I/O meter. What I have been trying to understand where's the problem, either my program defect (specially for data fetching (maybe leverage cache in SSD or FS in a certain way during read), and IOPS calculation), either the wrong way I used I/O meter for read IO evaluation, or the huge difference in the read I/O operation manner between windows threads reading data and I/O meter. Per your experience, where's the problem at most possibility? I believe I/OMeter is mature enough and the problem should be on my side. You and other developers in this mail list are the right persons to help me out this difficulty!! Again, IOMeter I was using is 1.1 rc downloaded from sourceforge. Is that the problem? Maybe I need to try older version. Any suggestion? Thanks in advance!! Nai Yan. 2012/2/15 Wesley Brown <ad...@we...>: > Hello! > > Just catching up on the list emails. > > Since you are on a windows system and you are wanting to get the same kinds of numbers as ioMeter you really should be using WriteFile and set the flags to unbuffered using FILE_FLAG_WRITE_THROUGH. With fwrite is a buffered write in most cases on windows. You should also make sure your buffer is sector aligned with the disk in question. Most disks are 512 byte sectors but there are newer 4k sector size disks and SSDs hitting the market as we speak. Since you aren't looking to do overlapped async IO through fwrite then just setting the write through flag should be what you are after. 
> > http://msdn.microsoft.com/en-us/library/windows/desktop/aa365747(v=vs.85).aspx > > > > -----Original Message----- > From: Nai yan zhao [mailto:zha...@gm...] > Sent: Tuesday, February 14, 2012 12:10 PM > To: jo...@ei... > Cc: Iom...@li... > Subject: Re: [Iometer-devel] Please advise - Why IOPS by IOMeter is much slower than windows multi-threading data fetch IOPS? > > Hello Joe, > I just tried 256KB with 8 outstanding I/Os. It's ONLY 188.84MB. > From my official SSD benchmark - sequential read is 510MB/s (mine is PX-128M3S, http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1). > My IOMeter version is iometer-1.1.0-rc1-win32.i386-bin. > OS is windows 7 64bit Home Premium SP1. > Memory is 6GB, > CPU is Intel i5-2430M. > > I will send you snapshots via another email, since its size exceeds 64KB. > > If I didn't use IOMeter right? Again, would you help explain more about difference between I/O meter read I/O calculation and my program's? > > Please advise!! > > Thanks. > > Nai Yan. > > 2012/2/13 <jo...@ei...>: >> Manufacturer's quoted sequential MB/s won't be with 512byte reads. In >> Iometer, try 256KB sequential reads with about 8 outstanding I/Os. >> That should come closer to the maximum throughput(I doubt you'll be >> able to get your laptop to actually get close to 520MB/s though). >> >> I'll see if I can find a windows system to try to compile/run your >> program, but I can't make any promises. >> >> >> Joe >> >> >> Quoting Nai yan zhao <zha...@gm...>: >> >>> Hello Joe, >>> Thank you again for your time! >>> It's wired that from IOMeter, the throughput for sequential IOPS >>> (512B, queue depth is 64) is ONLY 42MB/s with around 82K IOPS. >>> However, from that SSD official website, this SSD sequential >>> throughput should be around 510MB/s ( >>> http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1, my >>> SSD is 128G). If there's any parameter I didn't set correctly in IOMeter? 
>>> >>> As you suggested, I try to create a 12GB sample file (my test bed >>> memory is 6GB and without RAID) and use 1 thread to do IO. The result >>> is 33666; However, with I/O meter, it's 11572 (throughput this time >>> is ONLY 5.93MB/s); IOPS still 3 times!! >>> >>> I attach my IOMeter settings, if there's anything wrong? Also, I >>> attach my modified code. Joe, could you help again to see where's >>> the problem? >>> >>> Thank you so much!! >>> >>> Nai Yan. >>> >>> 2012/2/13 <jo...@ei...> >>> >>>> 82K sounds reasonable for iops on an SSD. You should check the specs >>>> of your drive to see what you should expect. >>>> >>>> You need to remember that you are doing file i/o so you have several >>>> layers of cache involved. think of it was file cache -> block cache >>>> -> controller cache -> drive cache (you aren't testing a HW RAID, so >>>> you probably don't have cache in you controller) My personal run of >>>> thumb for random I/O is to have my file size be about 3x my combined cache size. >>>> For >>>> example, 4G ram in system, 512MB RAID cache, (8 drives*32MB) = >>>> 4.75GB I'd do a 16GB file. >>>> >>>> If in iometer you are accessing a PHYSICALDISK, then you are >>>> avoiding window's file cache. >>>> >>>> I just pulled up the code and (keep in mind I'm not much of a >>>> windows >>>> guy) >>>> something looks odd in your GetSecs routine. The cast to double is >>>> going to lose resolution, I think I would store the start/end times >>>> as LARGE_INTEGER. And you probably only have to call the frequency >>>> routine once >>>> >>>> Also windows used to have issues in the HAL where if a thread got >>>> moved to a different processor you'd get odd results. There is a >>>> Windows API call for setting affinity, similar to the linux >>>> sched_set_affinity. 
>>>> >>>> This doesn't really matter for what we are talking about, it is just >>>> a pet peeve of mine, your "delete c;" should be "delete [] c;" (are >>>> you intending tp be timing your allocator calls as well? you may be >>>> if you are simulating system performance, but typically for disk >>>> performance you'd try to preallocate as much as possible so your >>>> only timing the transfers) >>>> >>>> >>>> If it were me I would start with something simplier, (say single >>>> threaded sequential read) and see if your program gets the correct values then. >>>> You >>>> could also fire up windows performance monitor and try to correlate >>>> to its counts as well (PHYSICALDISK transfers/sec). >>>> >>>> Good Luck, >>>> >>>> Joe >>>> >>>> >>>> >>>> Quoting Nai yan zhao <zha...@gm...>: >>>> >>>> Hello Fabian and Joe, >>>>> >>>>> Thank you so much for your reply. >>>>> >>>>> Actually, what I am trying to do, is to split a file into 32 >>>>> parts, and each part will be assigned to a thread to read. Each >>>>> thread each time to open file, read 512B, and close file. I was >>>>> trying to avoid 2 read I/Os hit 1 block(512B) - i.e. to avoid cache >>>>> in SSD (it's 128MB), although most read I/Os are ordered but not >>>>> >>>>> contiguous<http://en.**wikipedia.org/wiki/Contiguity#**Computer_sci >>>>> ence<http://en.wikipedia.org/wiki/Contiguity#Computer_science> >>>>> >>>>> > >>>>> . >>>>> >>>>> >>>>> By your suggestion, I tried 512B sequential I/O with settings >>>>> below, >>>>> >>>>> Max disk size - 8388608 >>>>> # of Outstanding I/O - 32 (for 64, it's also around 82K) >>>>> Transfer request size - 512B, >>>>> 100% sequential >>>>> Reply size - no reply >>>>> Align I/Os on - Sector boundaries >>>>> >>>>> The result is around 82K, still much slower than my program. >>>>> >>>>> If my program has any defect in calculating IOPS? Or if I have >>>>> any misunderstanding of caching of SSD or file system, which causes >>>>> my program fetches data most from RAM of SSD? 
Or what parameters I >>>>> should set in I/O meter to simulate my program I/O? >>>>> >>>>> Thank you again in advance for your time to help investigate it!! >>>>> >>>>> Nai Yan. >>>>> >>>>> 2012/2/11 Fabian Tillier <fa...@ti...> >>>>> >>>>> If I read the test correctly, all threads start at offset 0, and >>>>> then >>>>>> >>>>>> perform 512b reads with a 1024b stride between reads. As Joe >>>>>> said, this is pretty much sequential reading, and all threads are >>>>>> reading the same data, so most are likely to be satisifed from >>>>>> cache, either in the OS or on the SSD itself. They'll do >>>>>> 320000/16=20000 IO operations total each, so end up reading 20MB >>>>>> of the file. It's quite likely that the whole 20MB that you are >>>>>> reading will sit happilly in the file cache. >>>>>> >>>>>> Create an access pattern that mimics your app (512b sequential >>>>>> with 1024b stride), create 32 workers, and see if the results are similar. >>>>>> Best would be if you created a test file of 20MB, too. You can >>>>>> then see how things compare if you go with async I/O and a single thread. >>>>>> >>>>>> Cheers, >>>>>> -Fab >>>>>> >>>>>> On Fri, Feb 10, 2012 at 5:40 AM, <jo...@ei...> wrote: >>>>>> > Forgive me if I missed it, but I don't see any randomization in >>>>>> > your file reads. >>>>>> > >>>>>> > It looks like you just skip ahead so thread 0 reads the first >>>>>> > 512bytes, thread 1 the next 512b. So any storage will be >>>>>> > prefetching very effectively. >>>>>> > >>>>>> > Tell Iometer to do sequential instead of random and see how much >>>>>> > closer the numbers are. Or better yet, make your program >>>>>> > randomize its reads over the entire disk. >>>>>> > >>>>>> > Joe >>>>>> > >>>>>> > >>>>>> > Quoting Nai yan zhao <zha...@gm...>: >>>>>> > >>>>>> >> Greetings, >>>>>> >> Could anybody help me a little out of my difficulty? 
>>>>>> >> >>>>>> >> I have a SSD and I am trying to use it to simulate my >>>>>> >> program I/O performance, however, IOPS calculated from my >>>>>> >> program is much much >>>>>> faster >>>>>> >> than IOMeter. >>>>>> >> >>>>>> >> My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B >>>>>> >> random read IOPS is around 94k (queue depth is 32). >>>>>> >> However my program (32 windows threads) can reach around >>>>>> >> 500k >>>>>> 512B >>>>>> >> IOPS, around 5 times of IOMeter!!! I did data validation but >>>>>> >> didn't >>>>>> find >>>>>> >> any error in data fetching. It's because my data fetching in order? >>>>>> >> >>>>>> >> I paste my code belwo (it mainly fetch 512B from file and >>>>>> >> release >>>>>> it; >>>>>> >> I did use 4bytes (an int) to validate program logic and didn't >>>>>> >> find problem), can anybody help me figure out where I am wrong? >>>>>> >> >>>>>> >> Thanks so much in advance!! >>>>>> >> >>>>>> >> Nai Yan. >>>>>> >> >>>>>> >> #include <stdio.h> >>>>>> >> #include <Windows.h> >>>>>> >> /* >>>>>> >> ** Purpose: Verify file random read IOPS in comparison with >>>>>> >> IOMeter >>>>>> >> ** Author: Nai Yan >>>>>> >> ** Date: Feb. 9th, 2012 >>>>>> >> **/ >>>>>> >> //Global variables >>>>>> >> long completeIOs = 0; >>>>>> >> long completeBytes = 0; >>>>>> >> int threadCount = 32; >>>>>> >> unsigned long long length = 1073741824; //4G test >>>>>> file >>>>>> >> int interval = 1024; >>>>>> >> int resultArrayLen = 320000; >>>>>> >> int *result = new int[resultArrayLen]; //Method declarison >>>>>> >> double GetSecs(void); //Calculate out duration >>>>>> >> int InitPool(long long,char*,int); //Initialize test data >>>>>> >> for >>>>>> >> testing, if successful, return 1; otherwise, return a non 1 value. 
>>>>>> >> int * FileRead(char * path); >>>>>> >> unsigned int DataVerification(int*, int sampleItem); //Verify >>>>>> >> data fetched from pool int main() { int sampleItem = 0x1; char >>>>>> >> * fPath = "G:\\workspace\\4G.bin"; unsigned int invalidIO = 0; >>>>>> >> if (InitPool(length,fPath,**sampleItem)!= 1) >>>>>> >>>>>> >> printf("File write err... \n"); //start do random I/Os from >>>>>> >> initialized file double start = GetSecs(); int * fetchResult = >>>>>> >> FileRead(fPath); double end = GetSecs(); printf("File read >>>>>> >> IOPS is %.4f per second.. \n",completeIOs/(end - >>>>>> start)); >>>>>> >> //start data validation, for 4 bytes fetch only // invalidIO = >>>>>> >> DataVerification(fetchResult,**sampleItem); >>>>>> >>>>>> >> // if (invalidIO !=0) >>>>>> >> // { >>>>>> >> // printf("Total invalid data fetch IOs are %d", invalidIO); // >>>>>> >> } return 0; } >>>>>> >> >>>>>> >> >>>>>> >> int InitPool(long long length, char* path, int sample) { >>>>>> >> printf("Start initializing test data ... \n"); FILE * fp = >>>>>> >> fopen(path,"wb"); if (fp == NULL) { printf("file open err... >>>>>> >> \n"); exit (-1); } else //initialize file for testing { >>>>>> >> fseek(fp,0L,SEEK_SET); for (int i=0; i<length; i++) { >>>>>> >> fwrite(&sample,sizeof(int),1,**fp); >>>>>> >>>>>> >> } >>>>>> >> fclose(fp); >>>>>> >> fp = NULL; >>>>>> >> printf("Data initialization is complete...\n"); return 1; } } >>>>>> >> double GetSecs(void) { >>>>>> >> LARGE_INTEGER frequency; >>>>>> >> LARGE_INTEGER start; >>>>>> >> if(! QueryPerformanceFrequency(&**frequency)) >>>>>> >> printf("**QueryPerformanceFrequency Failed\n"); >>>>>> >> if(! 
QueryPerformanceCounter(&**start)) >>>>>> >> printf("**QueryPerformanceCounter Failed\n"); return >>>>>> >> ((double)start.QuadPart/(**double)frequency.QuadPart); >>>>>> >>>>>> >> } >>>>>> >> class input >>>>>> >> { >>>>>> >> public: >>>>>> >> char *path; >>>>>> >> int starting; >>>>>> >> input (int st, char * filePath):starting(st),path(**filePath){} >>>>>> >>>>>> >> }; >>>>>> >> //Workers >>>>>> >> DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) { >>>>>> >> input * in = (input*) lpThreadParameter; >>>>>> >> char* path = in->path; >>>>>> >> FILE * fp = fopen(path,"rb"); >>>>>> >> int sPos = in->starting; >>>>>> >> // int * result = in->r; >>>>>> >> if(fp != NULL) >>>>>> >> { >>>>>> >> fpos_t pos; >>>>>> >> for (int i=0; i<resultArrayLen/threadCount;**i++) >>>>>> >>>>>> >> { >>>>>> >> pos = i * interval; >>>>>> >> fsetpos(fp,&pos); >>>>>> >> //For 512 bytes fetch each time unsigned char *c =new unsigned >>>>>> >> char [512]; if (fread(c,512,1,fp) ==1) { >>>>>> >> InterlockedIncrement(&**completeIOs); >>>>>> >>>>>> >> delete c; >>>>>> >> } >>>>>> >> //For 4 bytes fetch each time >>>>>> >> /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) { >>>>>> >> InterlockedIncrement(&**completeIOs); >>>>>> >>>>>> >> }*/ >>>>>> >> else >>>>>> >> { >>>>>> >> printf("file read err...\n"); >>>>>> >> exit(-1); >>>>>> >> } >>>>>> >> } >>>>>> >> fclose(fp); >>>>>> >> fp = NULL; >>>>>> >> } >>>>>> >> else >>>>>> >> { >>>>>> >> printf("File open err... \n"); >>>>>> >> exit(-1); >>>>>> >> } >>>>>> >> } >>>>>> >> int * FileRead(char * p) >>>>>> >> { >>>>>> >> printf("Starting reading file ... 
\n"); >>>>>> >> HANDLE mWorkThread[256]; //max 256 threads >>>>>> >> completeIOs = 0; >>>>>> >> int slice = int (resultArrayLen/threadCount); for(int i = 0; i >>>>>> >> < threadCount; i++) { mWorkThread[i] = CreateThread( NULL, 0, >>>>>> >> FileReadThreadEntry, (LPVOID)(new input(i*slice,p)), 0, NULL); >>>>>> >> } >>>>>> >> WaitForMultipleObjects(**threadCount, mWorkThread, TRUE, >>>>>> >> INFINITE); >>>>>> >>>>>> >> printf("File read complete... \n"); >>>>>> >> return result; >>>>>> >> } >>>>>> >> unsigned int DataVerification(int* result, int sampleItem) { >>>>>> >> unsigned int invalid = 0; for (int i=0; i< >>>>>> >> resultArrayLen/interval;i++) { if (result[i]!=sampleItem) { >>>>>> >> invalid ++; continue; } } return invalid; } >>>>>> >> >>>>>> > >>>>>> > >>>>>> > >>>>>> > >>>>>> > >>>>>> ------------------------------**------------------------------** >>>>>> >>>>>> ------------------ >>>>>> > Virtualization & Cloud Management Using Capacity Planning Cloud >>>>>> > computing makes use of virtualization - but cloud computing also >>>>>> > focuses on allowing computing to be delivered as a service. >>>>>> > >>>>>> > http://www.accelacomm.com/jaw/**sfnl/114/51521223/<http://www.ac >>>>>> > celacomm.com/jaw/sfnl/114/51521223/> >>>>>> > ______________________________**_________________ >>>>>> > Iometer-devel mailing list >>>>>> > >>>>>> > Iometer-devel@lists.**sourceforge.net<Iom...@li...urce >>>>>> > forge.net> >>>>>> > >>>>>> > https://lists.sourceforge.net/**lists/listinfo/iometer-devel<htt >>>>>> > ps://lists.sourceforge.net/lists/listinfo/iometer-devel> >>>>>> >>>>>> >>>>> >>>> >>>> >>>> >>> >> >> >> > > ------------------------------------------------------------------------------ > Keep Your Developer Skills Current with LearnDevNow! > The most comprehensive online learning library for Microsoft developers is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3, Metro Style Apps, more. Free future releases when you subscribe now! 
> http://p.sf.net/sfu/learndevnow-d2d > _______________________________________________ > Iometer-devel mailing list > Iom...@li... > https://lists.sourceforge.net/lists/listinfo/iometer-devel |
From: <Rob_Thomas@Dell.com> - 2012-02-15 16:11:18
|
You will need kernel AIO to match Windows numbers. There is a patch for it somewhere on here. I don’t know, maybe its included in 1.1? -----Original Message----- From: Nai yan zhao [mailto:zha...@gm...] Sent: Tuesday, February 14, 2012 8:33 PM To: Wesley Brown Cc: jo...@ei...; Iom...@li... Subject: Re: [Iometer-devel] Please advise - Why IOPS by IOMeter is much slower than windows multi-threading data fetch IOPS? Hello Wesley, Thank you for your time to catch up this long mail list. I am not sure if "FILE_FLAG_WRITE_THROUGH" can really help read IO calculation, which is what I am concerning about. But I will try it later. Actually, after test file written to disk, even though I reboot my machine and rerun my program (this time, not write same test file to disk and just do read file operation), the test result reflected what windows threads reading is much faster than I/O meter. What I have been trying to understand where's the problem, either my program defect (specially for data fetching (maybe leverage cache in SSD or FS in a certain way during read), and IOPS calculation), either the wrong way I used I/O meter for read IO evaluation, or the huge difference in the read I/O operation manner between windows threads reading data and I/O meter. Per your experience, where's the problem at most possibility? I believe I/OMeter is mature enough and the problem should be on my side. You and other developers in this mail list are the right persons to help me out this difficulty!! Again, IOMeter I was using is 1.1 rc downloaded from sourceforge. Is that the problem? Maybe I need to try older version. Any suggestion? Thanks in advance!! Nai Yan. 2012/2/15 Wesley Brown <ad...@we...>: > Hello! > > Just catching up on the list emails. > > Since you are on a windows system and you are wanting to get the same kinds of numbers as ioMeter you really should be using WriteFile and set the flags to unbuffered using FILE_FLAG_WRITE_THROUGH. 
With fwrite is a buffered write in most cases on windows. You should also make sure your buffer is sector aligned with the disk in question. Most disks are 512 byte sectors but there are newer 4k sector size disks and SSDs hitting the market as we speak. Since you aren't looking to do overlapped async IO through fwrite then just setting the write through flag should be what you are after. > > http://msdn.microsoft.com/en-us/library/windows/desktop/aa365747(v=vs.85).aspx > > > > -----Original Message----- > From: Nai yan zhao [mailto:zha...@gm...] > Sent: Tuesday, February 14, 2012 12:10 PM > To: jo...@ei... > Cc: Iom...@li... > Subject: Re: [Iometer-devel] Please advise - Why IOPS by IOMeter is much slower than windows multi-threading data fetch IOPS? > > Hello Joe, > I just tried 256KB with 8 outstanding I/Os. It's ONLY 188.84MB. > From my official SSD benchmark - sequential read is 510MB/s (mine is PX-128M3S, http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1). > My IOMeter version is iometer-1.1.0-rc1-win32.i386-bin. > OS is windows 7 64bit Home Premium SP1. > Memory is 6GB, > CPU is Intel i5-2430M. > > I will send you snapshots via another email, since its size exceeds 64KB. > > If I didn't use IOMeter right? Again, would you help explain more about difference between I/O meter read I/O calculation and my program's? > > Please advise!! > > Thanks. > > Nai Yan. > > 2012/2/13 <jo...@ei...>: >> Manufacturer's quoted sequential MB/s won't be with 512byte reads. In >> Iometer, try 256KB sequential reads with about 8 outstanding I/Os. >> That should come closer to the maximum throughput(I doubt you'll be >> able to get your laptop to actually get close to 520MB/s though). >> >> I'll see if I can find a windows system to try to compile/run your >> program, but I can't make any promises. >> >> >> Joe >> >> >> Quoting Nai yan zhao <zha...@gm...>: >> >>> Hello Joe, >>> Thank you again for your time! 
>>> It's wired that from IOMeter, the throughput for sequential IOPS >>> (512B, queue depth is 64) is ONLY 42MB/s with around 82K IOPS. >>> However, from that SSD official website, this SSD sequential >>> throughput should be around 510MB/s ( >>> http://www.plextoramericas.com/index.php/ssd/px-m3-series?start=1, my >>> SSD is 128G). If there's any parameter I didn't set correctly in IOMeter? >>> >>> As you suggested, I try to create a 12GB sample file (my test bed >>> memory is 6GB and without RAID) and use 1 thread to do IO. The result >>> is 33666; However, with I/O meter, it's 11572 (throughput this time >>> is ONLY 5.93MB/s); IOPS still 3 times!! >>> >>> I attach my IOMeter settings, if there's anything wrong? Also, I >>> attach my modified code. Joe, could you help again to see where's >>> the problem? >>> >>> Thank you so much!! >>> >>> Nai Yan. >>> >>> 2012/2/13 <jo...@ei...> >>> >>>> 82K sounds reasonable for iops on an SSD. You should check the specs >>>> of your drive to see what you should expect. >>>> >>>> You need to remember that you are doing file i/o so you have several >>>> layers of cache involved. think of it was file cache -> block cache >>>> -> controller cache -> drive cache (you aren't testing a HW RAID, so >>>> you probably don't have cache in you controller) My personal run of >>>> thumb for random I/O is to have my file size be about 3x my combined cache size. >>>> For >>>> example, 4G ram in system, 512MB RAID cache, (8 drives*32MB) = >>>> 4.75GB I'd do a 16GB file. >>>> >>>> If in iometer you are accessing a PHYSICALDISK, then you are >>>> avoiding window's file cache. >>>> >>>> I just pulled up the code and (keep in mind I'm not much of a >>>> windows >>>> guy) >>>> something looks odd in your GetSecs routine. The cast to double is >>>> going to lose resolution, I think I would store the start/end times >>>> as LARGE_INTEGER. 
And you probably only have to call the frequency >>>> routine once >>>> >>>> Also windows used to have issues in the HAL where if a thread got >>>> moved to a different processor you'd get odd results. There is a >>>> Windows API call for setting affinity, similar to the linux >>>> sched_set_affinity. >>>> >>>> This doesn't really matter for what we are talking about, it is just >>>> a pet peeve of mine, your "delete c;" should be "delete [] c;" (are >>>> you intending tp be timing your allocator calls as well? you may be >>>> if you are simulating system performance, but typically for disk >>>> performance you'd try to preallocate as much as possible so your >>>> only timing the transfers) >>>> >>>> >>>> If it were me I would start with something simplier, (say single >>>> threaded sequential read) and see if your program gets the correct values then. >>>> You >>>> could also fire up windows performance monitor and try to correlate >>>> to its counts as well (PHYSICALDISK transfers/sec). >>>> >>>> Good Luck, >>>> >>>> Joe >>>> >>>> >>>> >>>> Quoting Nai yan zhao <zha...@gm...>: >>>> >>>> Hello Fabian and Joe, >>>>> >>>>> Thank you so much for your reply. >>>>> >>>>> Actually, what I am trying to do, is to split a file into 32 >>>>> parts, and each part will be assigned to a thread to read. Each >>>>> thread each time to open file, read 512B, and close file. I was >>>>> trying to avoid 2 read I/Os hit 1 block(512B) - i.e. to avoid cache >>>>> in SSD (it's 128MB), although most read I/Os are ordered but not >>>>> >>>>> contiguous<http://en.**wikipedia.org/wiki/Contiguity#**Computer_sci >>>>> ence<http://en.wikipedia.org/wiki/Contiguity#Computer_science> >>>>> >>>>> > >>>>> . 
>>>>> >>>>> >>>>> By your suggestion, I tried 512B sequential I/O with settings >>>>> below, >>>>> >>>>> Max disk size - 8388608 >>>>> # of Outstanding I/O - 32 (for 64, it's also around 82K) >>>>> Transfer request size - 512B, >>>>> 100% sequential >>>>> Reply size - no reply >>>>> Align I/Os on - Sector boundaries >>>>> >>>>> The result is around 82K, still much slower than my program. >>>>> >>>>> If my program has any defect in calculating IOPS? Or if I have >>>>> any misunderstanding of caching of SSD or file system, which causes >>>>> my program fetches data most from RAM of SSD? Or what parameters I >>>>> should set in I/O meter to simulate my program I/O? >>>>> >>>>> Thank you again in advance for your time to help investigate it!! >>>>> >>>>> Nai Yan. >>>>> >>>>> 2012/2/11 Fabian Tillier <fa...@ti...> >>>>> >>>>> If I read the test correctly, all threads start at offset 0, and >>>>> then >>>>>> >>>>>> perform 512b reads with a 1024b stride between reads. As Joe >>>>>> said, this is pretty much sequential reading, and all threads are >>>>>> reading the same data, so most are likely to be satisifed from >>>>>> cache, either in the OS or on the SSD itself. They'll do >>>>>> 320000/16=20000 IO operations total each, so end up reading 20MB >>>>>> of the file. It's quite likely that the whole 20MB that you are >>>>>> reading will sit happilly in the file cache. >>>>>> >>>>>> Create an access pattern that mimics your app (512b sequential >>>>>> with 1024b stride), create 32 workers, and see if the results are similar. >>>>>> Best would be if you created a test file of 20MB, too. You can >>>>>> then see how things compare if you go with async I/O and a single thread. >>>>>> >>>>>> Cheers, >>>>>> -Fab >>>>>> >>>>>> On Fri, Feb 10, 2012 at 5:40 AM, <jo...@ei...> wrote: >>>>>> > Forgive me if I missed it, but I don't see any randomization in >>>>>> > your file reads. 
>>>>>> > >>>>>> > It looks like you just skip ahead so thread 0 reads the first >>>>>> > 512bytes, thread 1 the next 512b. So any storage will be >>>>>> > prefetching very effectively. >>>>>> > >>>>>> > Tell Iometer to do sequential instead of random and see how much >>>>>> > closer the numbers are. Or better yet, make your program >>>>>> > randomize its reads over the entire disk. >>>>>> > >>>>>> > Joe >>>>>> > >>>>>> > >>>>>> > Quoting Nai yan zhao <zha...@gm...>: >>>>>> > >>>>>> >> Greetings, >>>>>> >> Could anybody help me a little out of my difficulty? >>>>>> >> >>>>>> >> I have a SSD and I am trying to use it to simulate my >>>>>> >> program I/O performance, however, IOPS calculated from my >>>>>> >> program is much much >>>>>> faster >>>>>> >> than IOMeter. >>>>>> >> >>>>>> >> My SSD is PLEXTOR PX-128M3S, by IOMeter, its max 512B >>>>>> >> random read IOPS is around 94k (queue depth is 32). >>>>>> >> However my program (32 windows threads) can reach around >>>>>> >> 500k >>>>>> 512B >>>>>> >> IOPS, around 5 times of IOMeter!!! I did data validation but >>>>>> >> didn't >>>>>> find >>>>>> >> any error in data fetching. It's because my data fetching in order? >>>>>> >> >>>>>> >> I paste my code belwo (it mainly fetch 512B from file and >>>>>> >> release >>>>>> it; >>>>>> >> I did use 4bytes (an int) to validate program logic and didn't >>>>>> >> find problem), can anybody help me figure out where I am wrong? >>>>>> >> >>>>>> >> Thanks so much in advance!! >>>>>> >> >>>>>> >> Nai Yan. >>>>>> >> >>>>>> >> #include <stdio.h> >>>>>> >> #include <Windows.h> >>>>>> >> /* >>>>>> >> ** Purpose: Verify file random read IOPS in comparison with >>>>>> >> IOMeter >>>>>> >> ** Author: Nai Yan >>>>>> >> ** Date: Feb. 
9th, 2012 >>>>>> >> **/ >>>>>> >> //Global variables >>>>>> >> long completeIOs = 0; >>>>>> >> long completeBytes = 0; >>>>>> >> int threadCount = 32; >>>>>> >> unsigned long long length = 1073741824; //4G test >>>>>> file >>>>>> >> int interval = 1024; >>>>>> >> int resultArrayLen = 320000; >>>>>> >> int *result = new int[resultArrayLen]; //Method declarison >>>>>> >> double GetSecs(void); //Calculate out duration >>>>>> >> int InitPool(long long,char*,int); //Initialize test data >>>>>> >> for >>>>>> >> testing, if successful, return 1; otherwise, return a non 1 value. >>>>>> >> int * FileRead(char * path); >>>>>> >> unsigned int DataVerification(int*, int sampleItem); //Verify >>>>>> >> data fetched from pool int main() { int sampleItem = 0x1; char >>>>>> >> * fPath = "G:\\workspace\\4G.bin"; unsigned int invalidIO = 0; >>>>>> >> if (InitPool(length,fPath,**sampleItem)!= 1) >>>>>> >>>>>> >> printf("File write err... \n"); //start do random I/Os from >>>>>> >> initialized file double start = GetSecs(); int * fetchResult = >>>>>> >> FileRead(fPath); double end = GetSecs(); printf("File read >>>>>> >> IOPS is %.4f per second.. \n",completeIOs/(end - >>>>>> start)); >>>>>> >> //start data validation, for 4 bytes fetch only // invalidIO = >>>>>> >> DataVerification(fetchResult,**sampleItem); >>>>>> >>>>>> >> // if (invalidIO !=0) >>>>>> >> // { >>>>>> >> // printf("Total invalid data fetch IOs are %d", invalidIO); // >>>>>> >> } return 0; } >>>>>> >> >>>>>> >> >>>>>> >> int InitPool(long long length, char* path, int sample) { >>>>>> >> printf("Start initializing test data ... \n"); FILE * fp = >>>>>> >> fopen(path,"wb"); if (fp == NULL) { printf("file open err... 
>>>>>> >> \n"); exit (-1); } else //initialize file for testing { >>>>>> >> fseek(fp,0L,SEEK_SET); for (int i=0; i<length; i++) { >>>>>> >> fwrite(&sample,sizeof(int),1,**fp); >>>>>> >>>>>> >> } >>>>>> >> fclose(fp); >>>>>> >> fp = NULL; >>>>>> >> printf("Data initialization is complete...\n"); return 1; } } >>>>>> >> double GetSecs(void) { >>>>>> >> LARGE_INTEGER frequency; >>>>>> >> LARGE_INTEGER start; >>>>>> >> if(! QueryPerformanceFrequency(&**frequency)) >>>>>> >> printf("**QueryPerformanceFrequency Failed\n"); >>>>>> >> if(! QueryPerformanceCounter(&**start)) >>>>>> >> printf("**QueryPerformanceCounter Failed\n"); return >>>>>> >> ((double)start.QuadPart/(**double)frequency.QuadPart); >>>>>> >>>>>> >> } >>>>>> >> class input >>>>>> >> { >>>>>> >> public: >>>>>> >> char *path; >>>>>> >> int starting; >>>>>> >> input (int st, char * filePath):starting(st),path(**filePath){} >>>>>> >>>>>> >> }; >>>>>> >> //Workers >>>>>> >> DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter) { >>>>>> >> input * in = (input*) lpThreadParameter; >>>>>> >> char* path = in->path; >>>>>> >> FILE * fp = fopen(path,"rb"); >>>>>> >> int sPos = in->starting; >>>>>> >> // int * result = in->r; >>>>>> >> if(fp != NULL) >>>>>> >> { >>>>>> >> fpos_t pos; >>>>>> >> for (int i=0; i<resultArrayLen/threadCount;**i++) >>>>>> >>>>>> >> { >>>>>> >> pos = i * interval; >>>>>> >> fsetpos(fp,&pos); >>>>>> >> //For 512 bytes fetch each time unsigned char *c =new unsigned >>>>>> >> char [512]; if (fread(c,512,1,fp) ==1) { >>>>>> >> InterlockedIncrement(&**completeIOs); >>>>>> >>>>>> >> delete c; >>>>>> >> } >>>>>> >> //For 4 bytes fetch each time >>>>>> >> /*if (fread(&result[sPos + i],sizeof(int),1,fp) ==1) { >>>>>> >> InterlockedIncrement(&**completeIOs); >>>>>> >>>>>> >> }*/ >>>>>> >> else >>>>>> >> { >>>>>> >> printf("file read err...\n"); >>>>>> >> exit(-1); >>>>>> >> } >>>>>> >> } >>>>>> >> fclose(fp); >>>>>> >> fp = NULL; >>>>>> >> } >>>>>> >> else >>>>>> >> { >>>>>> >> printf("File open 
err... \n"); >>>>>> >> exit(-1); >>>>>> >> } >>>>>> >> } >>>>>> >> int * FileRead(char * p) >>>>>> >> { >>>>>> >> printf("Starting reading file ... \n"); >>>>>> >> HANDLE mWorkThread[256]; //max 256 threads >>>>>> >> completeIOs = 0; >>>>>> >> int slice = int (resultArrayLen/threadCount); for(int i = 0; i >>>>>> >> < threadCount; i++) { mWorkThread[i] = CreateThread( NULL, 0, >>>>>> >> FileReadThreadEntry, (LPVOID)(new input(i*slice,p)), 0, NULL); >>>>>> >> } >>>>>> >> WaitForMultipleObjects(**threadCount, mWorkThread, TRUE, >>>>>> >> INFINITE); >>>>>> >>>>>> >> printf("File read complete... \n"); >>>>>> >> return result; >>>>>> >> } >>>>>> >> unsigned int DataVerification(int* result, int sampleItem) { >>>>>> >> unsigned int invalid = 0; for (int i=0; i< >>>>>> >> resultArrayLen/interval;i++) { if (result[i]!=sampleItem) { >>>>>> >> invalid ++; continue; } } return invalid; } >>>>>> >> >>>>>> > >>>>>> > >>>>>> > >>>>>> > >>>>>> > >>>>>> ------------------------------**------------------------------** >>>>>> >>>>>> ------------------ >>>>>> > Virtualization & Cloud Management Using Capacity Planning Cloud >>>>>> > computing makes use of virtualization - but cloud computing also >>>>>> > focuses on allowing computing to be delivered as a service. >>>>>> > >>>>>> > http://www.accelacomm.com/jaw/**sfnl/114/51521223/<http://www.ac >>>>>> > celacomm.com/jaw/sfnl/114/51521223/> >>>>>> > ______________________________**_________________ >>>>>> > Iometer-devel mailing list >>>>>> > >>>>>> > Iometer-devel@lists.**sourceforge.net<Iom...@li...urce >>>>>> > forge.net> >>>>>> > >>>>>> > https://lists.sourceforge.net/**lists/listinfo/iometer-devel<htt >>>>>> > ps://lists.sourceforge.net/lists/listinfo/iometer-devel> >>>>>> >>>>>> >>>>> >>>> >>>> >>>> >>> >> >> >> > > ------------------------------------------------------------------------------ > Keep Your Developer Skills Current with LearnDevNow! 
> The most comprehensive online learning library for Microsoft developers is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3, Metro Style Apps, more. Free future releases when you subscribe now! > http://p.sf.net/sfu/learndevnow-d2d > _______________________________________________ > Iometer-devel mailing list > Iom...@li... > https://lists.sourceforge.net/lists/listinfo/iometer-devel ------------------------------------------------------------------------------ Virtualization & Cloud Management Using Capacity Planning Cloud computing makes use of virtualization - but cloud computing also focuses on allowing computing to be delivered as a service. http://www.accelacomm.com/jaw/sfnl/114/51521223/ _______________________________________________ Iometer-devel mailing list Iom...@li... https://lists.sourceforge.net/lists/listinfo/iometer-devel |