The Intel® Parallel Studio XE 2015 Composer Editions for C++ Windows* and Linux* have a feature enhancement supporting data transfer for non-contiguous array elements with the Intel® Language Extensions for Offload (LEO) for the Intel® Xeon Phi™ coprocessor.
The feature adds support under the LEO offload data marshalling model for transferring non-contiguous array elements within an array variable reference (variable-ref) in the data transfer clauses (i.e. in, out, inout, nocopy) of the #pragma offload/offload_transfer statement.
Under the offload data marshalling model, each data transfer clause (in, out, inout, nocopy) shares a common basic syntax shown in the details below. The feature enhancement enables specifying a value for stride in the c-shape specification described below.
Syntax:
#pragma offload clause [ clause … ]
#pragma offload_transfer clause [ clause … ]
Where clause may include the data transfer clauses of:
in (variable-ref [, variable-ref …] [ : modifier [ modifier … ] ])
out (variable-ref [, variable-ref …] [ : modifier [ modifier … ] ])
inout (variable-ref [, variable-ref …] [ : modifier [ modifier … ] ])
nocopy (variable-ref [, variable-ref …] [ : modifier [ modifier … ] ])
And variable-ref is:
identifier
variable-ref , identifier
Use the following syntax for variable-ref
• variable-name: length ( number-of-elements)
variable-ref[c-shape]
Use the following syntax for variable-ref
• variable-name[start : number-of-elements] denotes a contiguous
set of array elements
• variable-name[start : number-of-elements : stride] denotes either
a contiguous or a non-contiguous set of array elements
And modifier is:
Unchanged by the feature enhancement. Refer to the User and Reference Guide
for the Intel® C++ Compiler 15.0 for details.
The following example illustrates the use of the feature enhancement with non-unit stride with various data movement clauses and modifiers.
--------------------------------------------------------------------------
#include <stdio.h>#define ALLOC alloc_if(1)#define No_ALLOC alloc_if(0)#define FREE free_if(1)#define No_FREE free_if(0)#define REUSE alloc_if(0) free_if(0)__declspec( target (mic)) int *a, *b, *c, *d;__declspec( target (mic)) int n=16;__declspec( target (mic))void print_array(char *str,int * array,int start,int count){ printf(str); printf(" %d",array[start:count]); printf("\n"); fflush(0);}void print_header(char *str){ int i; printf(str); for (i=1;i < strlen(str); i++) printf("="); printf("\n"); fflush(0);}void initialize(){ int i; for (i = 0; i < n; i++) a[i]=i+2; b[0:n]=1; c[0:n]=a[0:n]; d[0:2*n]=0;}void IN_with_stride(){ int i; char msg[20]=""; print_header("Illustrate IN with non-unit stride\n"); print_array("host : a =",a,0,n); // Allocate space for a only, allocate/transfer b #pragma offload_transfer target(mic:0) mandatory \ nocopy(a : length(n) ALLOC No_FREE) \ in(b : length(n) ALLOC No_FREE) // Transfer 1/2 of the values of array a with non-unit stride #pragma offload target(mic:0) mandatory \ in(a[0:n/2:2] : REUSE ) \ nocopy(b : REUSE) { sprintf(msg,"-> mic%d : b (before) =",_Offload_get_device_number()); print_array(msg,b,0,n); for (i = 0; i < n; i++) b[i] = a[i]; sprintf(msg,"-> mic%d : a =",_Offload_get_device_number()); print_array(msg,a,0,n); sprintf(msg,"-> mic%d : b (after) =",_Offload_get_device_number()); print_array(msg,b,0,n); } // Free allocations #pragma offload_transfer target(mic:0) mandatory \ nocopy(a,b : No_ALLOC FREE) printf("\n");}void IN_with_ALLOC_with_stride(){ int i,l,cnt,s; char msg[20]=""; print_header("Illustrate IN with ALLOC with non-unit stride\n"); print_array("host : c =",c,0,n); l = 0; cnt = n/2; s = 2; // Allocate partial array and transfer non-unit stride elements // Ensure the number of elements transferred plus the stride // does not exceed the size of the partial allocation #pragma offload target(mic:0) mandatory \ in(c[l:(cnt/s)+(s%2):s] : alloc (c[l:cnt]) ALLOC FREE) { sprintf(msg,"-> mic%d : c 
=",_Offload_get_device_number()); print_array(msg,c,l,cnt); } printf("\n");}void INTO_with_stride(){ int i,l,cnt,s; char msg[20]=""; print_header("Illustrate INTO with non-unit stride\n"); c[0:n]=a[0:n]; print_array("host : a =",a,0,n); print_array("host : c (before) =",c,0,n); print_array("host : d[0:n] =",d,0,n); print_array("host : d[n:n] =",d,n,n); l = n/2; cnt = n/2; s = 2; // Allocate d only #pragma offload_transfer target(mic:0) mandatory \ nocopy(d : length(2*n) ALLOC No_FREE) // Transfer a elements with non-unit stride into d on coprocessor only #pragma offload target(mic:0) mandatory \ in(a[0:cnt:s] : into (d[n:cnt:s]) REUSE) { sprintf(msg,"-> mic%d : d[0:n] =",_Offload_get_device_number()); print_array(msg,d,0,n); sprintf(msg,"-> mic%d : d[n:n] =",_Offload_get_device_number()); print_array(msg,d,n,n); } // Transfer d elements with non-unit stride into c on host only // Free the allocation #pragma offload_transfer target(mic:0) mandatory \ out(d[n:cnt:s] : into (c[1:cnt:s]) No_ALLOC FREE) print_array("host : c (after) =",c,0,n); printf("\n");}int main(int argc, char* argv[]){ a = (int *) _mm_malloc(n*sizeof(int), 64); b = (int *) _mm_malloc(n*sizeof(int), 64); c = (int *) _mm_malloc(n*sizeof(int), 64); d = (int *) _mm_malloc((2*n)*sizeof(int), 64); initialize(); IN_with_stride(); IN_with_ALLOC_with_stride(); INTO_with_stride();}